# Libraries
library(here)
library(vroom)
library(knitr)
library(kableExtra)
library(lubridate)
library(viridis)
library(janitor)
library(sciplot)
library(meta)
library(broom)
library(car)
library(pastecs)
library(qqman)
library(ggrepel)
library(ez)
library(MASS)
library(DT)
library(tidyverse)

# library(rsq)
# library(ROCR)
# library(fitdistrplus)

knitr::opts_chunk$set(fig.width = 8.5)

# Set theme for ggplot
theme_set(theme_bw())

GWAS Code

We first ran a genome-wide association study in the UK Biobank European cohort with gout as the outcome. This was done with a total of 27,287,012 variants (after imputation), and adjusted for age, sex, and the first 40 principal components. Initially, I did the following to clean up this summary stats file:

  1. I read in the summary statistics.

  2. I then did some initial filtering to remove X and Y chromosome SNPs, and some of the indels.

  3. I removed variants that were not in the imputed CoreExome genotype data.

  4. I then filtered out multi-allelic variants and variants with MAF < 0.01.

  5. For the genotyped PRS, I additionally filtered out variants that were not genotyped in the CoreExome.

# Reading in summary stats
if(file.exists(here("Output/Temp/sumstat5.RData"))) {
  #load(here("Output/Temp/sumstat5.RData"))
} else {
  if(file.exists(here("Output/Temp/sumstat3.RData"))) {
    load(here("Output/Temp/sumstat3.RData"))
  } else {
    sumstat <- vroom(here("Data/GWAS/ukbb_gout-allcontrol_chr1-22.X.XY.add_unfiltered_p.tsv"), delim = "\t", col_names = TRUE) # start with 27,287,012 variants in the summary stats
  
    sumstat2 <- sumstat %>% 
      filter(str_length(A1) == 1,
             CHR %in% 1:22) # Removing 738,219 indels and keeping only autosomes (removes a further 1,069,966 variants) = 25,478,827 variantsco
    
    sumstat2_1 <- sumstat2 %>% 
      mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,1],
             SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", n = 2, simplify = TRUE) %>% .[,2]) # this takes forever to run
    
    # test <- sumstat2_1 %>% 
    #   filter(str_detect(SNP2, ",")) %>% 
    #   separate(SNP2, into = c("SNP2", "extra"), sep = ",") # 1,696 variants had an extra comma in their ID
    # 
    # test1 <- test %>% 
    #   mutate(extra = as.numeric(extra)) %>% 
    #   filter(extra != CHR) # All just had their chromosome attached, can just remove the extra bit
    
    sumstat2_2 <- sumstat2 %>% 
      mutate(SNP1 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,1],
             SNP2 = sumstat2 %>% pull(SNP) %>% str_split(",", simplify = TRUE) %>% .[,2])
    
    # test <- sumstat2_2 %>% filter(str_detect(SNP2, ",")) # none with ,
    
    test1 <- sumstat2_2 %>% 
      filter(str_detect(SNP1, regex("^rs[0-9]+"))) # 24,248,522 variants have an rsID in SNP1 column
    
    # test1_1 <- test1 %>% 
    #   filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 633,645 of these have an rsID in SNP2 column
    # 
    # test1_1_1 <- test1_1 %>%
    #   filter(SNP1 != SNP2) # 1,367 variants have two rsIDs, for the most part SNP1 appears to be the newest rsID, but I might want to keep the extra rsID in a separate column
    
    test1_2 <- test1 %>% 
      filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 23,610,697 variants have rsID in SNP1 column and chr:bp_a1_a2 in SNP2 column
    
    test1_2_1 <- test1_2 %>% 
      mutate(CHR2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
             BP2 = test1_2 %>% pull(SNP2) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
             Allele1 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,2],
             Allele2 = test1_2 %>% pull(SNP2) %>% str_split("_", simplify = TRUE) %>% .[,3])
    
    # tmp <- test1_2_1 %>% 
    #   filter(BP != BP2) # all BPs are equal
    # 
    # tmp <- test1_2_1 %>% 
    #   filter(CHR != CHR2) # all CHRs are equal
    # 
    # tmp <- test1_2_1 %>% 
    #   filter(A1 != Allele2) # Allele2 is not always A1
    
    test1_2_2 <- test1_2_1 %>% 
      select(-CHR2, -BP2) %>% 
      filter(str_length(Allele1) == 1,
             str_length(Allele2) == 1) # removes a further 219,771 indels = 23,390,926 SNPs
    
    test1_2_final <- test1_2_2 %>% 
      select(SNP, Allele1, Allele2)
    
    # test1_3 <- test1 %>% 
    #   filter(!str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$|^rs[0-9]+$"))) # 4,180 variants have neither rsID nor chr:bp_a1_a2 in SNP2 column
    # 
    # test1_4 <- test1_3 %>% 
    #   filter(!str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All of these are in the format Affx-<number>
    
    test1_final <- test1 %>% 
      mutate(RSID = SNP1,
             ALT_RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) & SNP1 != SNP2 ~ SNP2, TRUE ~ NA_character_),
             AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
             SNP_ID = case_when(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP2, TRUE ~ NA_character_)) %>% 
      left_join(test1_2_final, by = "SNP")
    
    test1_final2 <- test1_final %>% 
      filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
    
    # nrow(test1_final2) - nrow(test1_final) # successfully removed indels from final dataset
    
    # sum(!is.na(test1_final2$RSID))
    # sum(!is.na(test1_final2$ALT_RSID))
    # sum(!is.na(test1_final2$AFFYID))
    # sum(!is.na(test1_final2$SNP_ID))
    # sum(!is.na(test1_final2$Allele1))
    # sum(!is.na(test1_final2$Allele2))
    
    test2 <- sumstat2_2 %>% 
      filter(!str_detect(SNP1, regex("^rs[0-9]+"))) # 1,230,305 variants don't have an rsID in SNP1 column
    
    test2_1 <- test2 %>% 
      filter(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,053 have the SNP_ID format in SNP1
    
    test2_1_1 <- test2_1 %>% 
      mutate(CHR2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,1] %>% as.numeric(),
             BP2 = test2_1 %>% pull(SNP1) %>% str_split(":", simplify = TRUE) %>% .[,2] %>% str_split("_", simplify = TRUE) %>% .[,1] %>% as.numeric(),
             Allele1 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,2],
             Allele2 = test2_1 %>% pull(SNP1) %>% str_split("_", simplify = TRUE) %>% .[,3])
    
    # tmp <- test2_1_1 %>%
    #   filter(BP != BP2) # all BPs are equal
    # 
    # tmp <- test2_1_1 %>%
    #   filter(CHR != CHR2) # all CHRs are equal
    
    test2_1_2 <- test2_1_1 %>% 
      select(-CHR2, -BP2) %>% 
      filter(str_length(Allele1) == 1,
             str_length(Allele2) == 1)
    
    # nrow(test2_1_2) - nrow(test2_1_1) # removed 975,583
    
    test2_1_final <- test2_1_2 %>% 
      select(SNP, Allele1, Allele2)
    
    # test2_1_1 <- test2_1 %>% 
    #   filter(str_detect(SNP2, regex("^rs[0-9]+"))) # 23 of these have RSID in SNP2
    
    # test2_1_2 <- test2_1 %>% 
    #   filter(str_detect(SNP2, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$"))) # 1,230,029 of these have SNP_ID in SNP2
    # 
    # test2_1_2_1 <- test2_1_2 %>% 
    #   filter(SNP1 != SNP2) # All SNP_ID columns identical in these individuals
    
    # test2_1_3 <- test2_1 %>% 
    #   filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # 1 has AffyID in SNP2
      
    
    # test2_2 <- test2 %>% 
    #   filter(str_detect(SNP1, regex("^Affx-[0-9]+$"))) # 252 have an AffyID in SNP1
    # 
    # test2_2_1 <- test2_2 %>% 
    #   filter(str_detect(SNP2, regex("^Affx-[0-9]+$"))) # All have AffyID in SNP2
    # 
    # test2_2_1_1 <- test2_2_1 %>% 
    #   filter(SNP1 != SNP2) # All AffyID columns identical in these individuals
    
    test2_final <- test2 %>% 
      mutate(RSID = case_when(str_detect(SNP2, regex("^rs[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
             ALT_RSID = NA_character_,
             AFFYID = case_when(str_detect(SNP2, regex("^Affx-[0-9]+$")) ~ SNP2, TRUE ~ NA_character_),
             SNP_ID = case_when(str_detect(SNP1, regex("^[0-9]+:[0-9]+_[ACGT]+_[ACGT]+$")) ~ SNP1, TRUE ~ NA_character_)) %>% 
      left_join(test2_1_final, by = "SNP")
    
    test2_final2 <- test2_final %>% 
      filter(is.na(SNP_ID) | (!is.na(SNP_ID) & !is.na(Allele1)))
    
    sumstat3 <- rbind(test1_final2, test2_final2) %>% arrange(CHR, BP)
    
    save(sumstat3, file = here("Output/Temp/sumstat3.RData"))
  }
  
  # tmp <- sumstat3 %>%
  #  select(CHR, BP) %>%
  #  unique()
  
  # for(i in 1:22) {
  #   write_delim(select(filter(tmp, CHR == i), BP), file = paste0(here("Output/Temp/"), "chr", i, "_snplist.txt"), delim = "\n")
  # }
  
  # Now we will work out which of these SNPs are present in the Imputed CoreEx data
  
  #system(paste0('source ~/.bashrc; parallel "zcat /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Imputed_Genotypes/QC1-10_Impute_EUR_only/CZ-MB1.2-QC1.10_EUR_imputed_chr{}.vcf.gz | grep -v ', "'#'",' | cut -f2 | grep -Fwf ', here("Output/Temp/"),'chr{}_snplist.txt > ', here("Output/Temp/"),'matched_snps_chr{}.txt" ::: {1..22}'))
  
  out <- tibble()
  for(i in 1:22) {
    assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("matched_snps_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
    out <- rbind(out, get(paste0("chr", i, "_snps")))
    rm(list = paste0("chr", i, "_snps"), i)
  }
  
  out <- out %>% 
    select(CHR, X1) %>% 
    mutate(C_B = paste0(CHR, "_", X1))
  
  sumstat4 <- sumstat3 %>% 
    mutate(C_B = paste0(CHR, "_", BP)) %>% 
    filter(C_B %in% out$C_B)
  
  rm(sumstat3)
  
  # Now to pull out the MAF for all SNPs
  # Already have CHR and BP for all SNPs separated by chr in matched_snps_chr* and mfi files are per chr so can simply match location in grep
  
  #system(paste0('source ~/.bashrc; parallel "grep -Fwhf ', here("Output/Temp/"), 'matched_snps_chr{}.txt /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/splits/ukb_mfi_chr{}_v3.txt > ', here("Output/Temp/"), 'ukb_maf_info_chr{}.txt" ::: {1..22}'))
  
  out <- tibble()
  for(i in 1:22) {
    assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
    out <- rbind(out, get(paste0("chr", i, "_snps")))
    rm(list = paste0("chr", i, "_snps"), i)
  }
  
  colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
  
  # test <- out %>% 
  #   filter(is.na(Allele1) | is.na(Allele2))
  
  out <- out %>% 
    mutate(C_B = paste0(CHR, "_", BP)) %>% 
    filter(str_length(Allele1) == 1,
           str_length(Allele2) == 1,
           MAF > 0.01,
           MAF < 0.99,
           INFO > 0.3)
  
  out2 <- out %>% 
    filter(C_B %in% sumstat4$C_B) %>% 
    arrange(CHR, BP)
  
  sumstat5 <- sumstat4 %>% 
    semi_join(out2, by = c("CHR", "BP"))
  
  save(sumstat5, file = here("Output/Temp", "sumstat5.RData"))
}

# Pulling out biallelic SNPs only and adding MAF and INFO columns, also making sure MAF etc are correct for allele1/2
if(file.exists(here("Output/Temp/biallelic_sumstat_final.RData"))) {
  # load(here("Output/Temp/biallelic_sumstat_final.RData"))
  # load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
} else {
  tmp <- sumstat5 %>% 
    filter(duplicated(C_B))
  
  sumstat5_biallelic <- sumstat5 %>% 
    filter(!(C_B %in% tmp$C_B))
  
  out <- tibble()
  for(i in 1:22) {
    assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
    out <- rbind(out, get(paste0("chr", i, "_snps")))
    rm(list = paste0("chr", i, "_snps"), i)
  }
  
  colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
  
  out <- out %>% 
    mutate(C_B = paste0(CHR, "_", BP)) %>% 
    filter(str_length(Allele1) == 1,
           str_length(Allele2) == 1,
           MAF > 0.01,
           MAF < 0.99,
           INFO > 0.3)
  
  out2 <- out %>% 
    filter(C_B %in% sumstat5_biallelic$C_B) %>% 
    arrange(CHR, BP)
  
  tmp <- out2 %>% 
    filter(duplicated(C_B))
  
  out2 <- out2 %>% 
    filter(!(C_B %in% tmp$C_B))
    
  sumstat5_biallelic <- sumstat5_biallelic %>% 
    left_join(out2, by = c("CHR", "BP"))
    
  sumstat5_biallelic <- sumstat5_biallelic %>% 
    semi_join(out2, by = c("CHR", "BP"))
  
  # test <- sumstat5_biallelic %>% 
  #   filter(str_length(Allele1.x) == 1,
  #          str_length(Allele1.y) == 1)
  # 
  # test2 <- sumstat5_biallelic %>% 
  #   filter(!(C_B.x %in% test$C_B.x))
  # 
  # test3 <- test2 %>% 
  #   filter(!is.na(Allele1.x)) # This just removed all of the SNPs without SNP_ID column
  
  # sum(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # only 5 SNPs don't match by name
  test <- sumstat5_biallelic %>% 
    filter(sumstat5_biallelic$SNP1.x != sumstat5_biallelic$SNP2.y) # 4 are multiallelic, one is an indel (looked it up in dbSNP)
  extra_multiallelic1 <- test %>% 
    slice(-2)
  
  #sum(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y, na.rm = T) # 878 don't match
  test <- sumstat5_biallelic %>% 
    filter(sumstat5_biallelic$Allele2.x != sumstat5_biallelic$Allele2.y) # these are all mismatched alleles (i.e. multi-allellic)
  extra_multiallelic2 <- sumstat5_biallelic %>%
    filter((Allele1.x == Allele1.y & Allele2.x != Allele2.y) | (Allele1.x != Allele1.y & Allele2.x == Allele2.y))
  
  sumstat5_biallelic1 <- sumstat5_biallelic %>% 
    filter(is.na(Allele1.x),
           !(SNP %in% extra_multiallelic2$SNP)) %>% 
    rename(Allele1 = Allele1.y,
           Allele2 = Allele2.y,
           Effect_Allele = A1) %>% 
    select(-Allele1.x, -Allele2.x)
  
  sumstat5_biallelic2 <- sumstat5_biallelic %>% 
    filter(Allele1.x == Allele1.y,
           Allele2.x == Allele2.y,
           !(SNP %in% extra_multiallelic2$SNP)) %>% 
    rename(Allele1 = Allele1.x,
           Allele2 = Allele2.x,
           Effect_Allele = A1) %>% 
    select(-Allele1.y, -Allele2.y)
  
  # test <- sumstat5_biallelic %>% 
    # filter(!(SNP %in% sumstat5_biallelic1$SNP) & !(SNP %in% sumstat5_biallelic2$SNP))
  
  sumstat5_biallelic3 <- sumstat5_biallelic2 %>% 
    select(CHR:SNP_ID, C_B.x:SNP2.y, Allele1, Allele2, MAF:C_B.y) %>% 
    rbind(sumstat5_biallelic1) %>% 
    arrange(CHR, BP)
  
  test <- sumstat5_biallelic3 %>% 
    filter(Minor_Allele != Effect_Allele)
  
  #summary(test$MAF) # all really close to 0.5 MAF, just need to flip OR, L95, and U95 then set Effect_Allele to Minor_Allele column
  
  test <- test %>% 
    mutate(OR = 1/OR,
           tmp = 1/L95,
           tmp2 = 1/U95,
           L95 = tmp2,
           U95 = tmp,
           Effect_Allele = Minor_Allele) %>% 
    rename(EAF = MAF) %>% 
    select(-tmp, -tmp2, -Minor_Allele)
  
  sumstat5_biallelic4 <- sumstat5_biallelic3 %>% 
    filter(Minor_Allele == Effect_Allele) %>% 
    select(-Minor_Allele) %>% 
    rename(EAF = MAF) %>% 
    rbind(test) %>% 
    arrange(CHR, BP)
  
  test <- sumstat5_biallelic4 %>% 
    filter(Allele2 == Effect_Allele) %>% 
    rename(Alternate_Allele = Allele1) %>% 
    select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
  
  test2 <- sumstat5_biallelic4 %>% 
    filter(Allele1 == Effect_Allele) %>% 
    rename(Alternate_Allele = Allele2) %>% 
    select(CHR:Effect_Allele, Alternate_Allele, TEST:SNP2.y, EAF:C_B.y)
  
  sumstat5_biallelic5 <- rbind(test, test2) %>% 
    arrange(CHR, BP)
  
  biallelic_sumstat_final <- sumstat5_biallelic5
  
  # tmp <- biallelic_sumstat_final %>% 
  #   filter(C_B.x != C_B.y)
  # 
  # tmp <- biallelic_sumstat_final %>% 
  #   filter(SNP1.x != SNP2.y) # Only non-matching site was an indel (can remove)
  # 
  # tmp <- biallelic_sumstat_final %>% 
  #   filter(SNP2.x != SNP1.y) # This includes the non-matching site from above and every other site is just wrong in the SNP1.y column (but they are the same SNP) => keep SNP2.x
  
  biallelic_sumstat_final <- biallelic_sumstat_final %>% 
    filter(SNP1.x == SNP2.y) %>% 
    select(-C_B.x, -C_B.y, -SNP2.y, -SNP1.y) %>% 
    rename(SNP1 = SNP1.x,
           SNP2 = SNP2.x)
  
  
  
  # Removing variants that aren't in the directly genotyped CoreEx data (for Polynesian analysis)
  # snplist_plink2 <- biallelic_sumstat_final %>% 
  #   mutate(BP2 = BP) %>% 
  #   select(CHR, BP, BP2, SNP)
  
  #write_delim(snplist_plink2, delim = " ", file = here("Output/Temp/snplist_plink2.txt"), col_names = F)
  
  # First filtering CoreEx files to only include individuals of interest and SNPs with a callrate of 95%
  CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t")
  
  All_Euro_ID <- read_delim(here("Output/Temp/merged_PRS_UKBB.fam"), delim = " ", col_names = F)
  
  CoreExPheno_Euro <- CoreExPheno %>% 
    filter(Geno.BroadAncestry == "European",
           Geno.SampleID %in% All_Euro_ID$X2,
           General.Use != "No",
           !(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))
  
  All_CoreEx_ID <- read_delim("/Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam", delim = " ", col_names = F)
  
  CoreExPheno_Poly <- CoreExPheno %>% 
    filter(Geno.BroadAncestry == "Oceanian",
           Geno.SampleID %in% All_CoreEx_ID$X2,
           General.Use != "No",
           !(Pheno.Study %in% c("ESR", "Pacific Trust")),
           !is.na(Pheno.GoutSummary))
  
  rm(CoreExPheno, All_CoreEx_ID, All_Euro_ID)
  
  all_coreex_ids <- rbind(CoreExPheno_Euro, CoreExPheno_Poly) %>% 
    select(Geno.FamilyID, Geno.SampleID)
  
  #write_delim(all_coreex_ids, delim = "\t", file = here("Output/Temp/all_coreex_ids.txt"), col_names = F)
  
  #system(paste0("source ~/.bashrc; plink1.9b4.9 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --keep ", here("Output/Temp/all_coreex_ids.txt"), " --extract range ", here("Output/Temp/snplist_plink2.txt"), " --geno 0.05 --make-bed --out ", here("Output/Temp/inCoreExGeno")))
  
  geno <- read_delim(here("Output/Temp", "inCoreExGeno.bim"), delim = "\t", col_names = FALSE) %>% 
    mutate(CHR_BP = paste0(X1, "_", X4))
  
  biallelic_sumstat_final_poly <- biallelic_sumstat_final %>% 
    mutate(CHR_BP = paste0(CHR, "_", BP)) %>% 
    filter(CHR_BP %in% geno$CHR_BP)
  
  rm(geno, snplist_plink2)
  
  save(biallelic_sumstat_final, file = here("Output/Temp", "biallelic_sumstat_final.RData"))
  
  save(biallelic_sumstat_final_poly, file = here("Output/Temp", "biallelic_sumstat_final_poly.RData"))
}
  

# Dealing with multi-allelic SNPs (leave for later)
if(file.exists(here("Output/Temp/multi_allelic_for_later.RData"))) {
  #load(here("Output/Temp/multi_allelic_for_later.RData"))
} else {
  tmp <- biallelic_sumstat_final %>% 
    mutate(C_B = paste0(CHR, "_", BP))
  
  sumstat5_multi <- sumstat5 %>% 
    filter(!(C_B %in% tmp$C_B)) # 38,991 rows contain multi-allelic sites
  
  out <- tibble()
  for(i in 1:22) {
    assign(paste0("chr", i, "_snps"), read_delim(here("Output/Temp", paste0("ukb_maf_info_chr", i, ".txt")), delim = "\t", col_names = FALSE) %>% mutate(CHR = i))
    out <- rbind(out, get(paste0("chr", i, "_snps")))
    rm(list = paste0("chr", i, "_snps"), i)
  }
  
  colnames(out) <- c("SNP1", "SNP2", "BP", "Allele1", "Allele2", "MAF", "Minor_Allele", "INFO", "CHR")
  
  out <- out %>% 
    mutate(C_B = paste0(CHR, "_", BP)) %>% 
    filter(str_length(Allele1) == 1,
           str_length(Allele2) == 1,
           MAF > 0.01,
           MAF < 0.99,
           INFO > 0.3)
  
  out2 <- out %>% 
    filter(C_B %in% sumstat5_multi$C_B) %>% 
    arrange(CHR, BP) # 26,235 of the 38,991 chr/bp locations match
  
  sumstat5_multi <- sumstat5_multi %>% 
    left_join(out2, by = c("CHR", "BP"))
  
  # tmp <- sumstat5_multi %>% 
  #   unique() # all unique
  
  # tmp <- sumstat5_multi %>% 
  #   filter(duplicated(C_B.x))
  # 
  # test <- sumstat5_multi %>% 
  #   filter(C_B.x %in% tmp$C_B.x) # 27,909 rows contain duplicated C_B.x
  
  save(sumstat5_multi, file = here("Output/Temp", "multi_allelic_for_later.RData"))
}

To get from these cleaned up summary statistics to the final list of SNPs for the Imputed PRS, I did the following:

  1. I filtered out all SNPs with P-values greater than 5e-8.

  2. I took each lead SNP within a 1 Mb window and used these to define 23 crude loci.

  3. This list of lead SNPs was further filtered to only include one lead SNP per full locus.

    • The boundaries of these “full loci” were defined based on two consecutive genome-wide significant SNPs being more than 500 kb apart.
  4. Next, SNPs in the UK Biobank BGEN files were extracted if they fit within the boundaries of these “full loci”.

  5. Conditional GWAS were run at each locus, conditioning on the lead SNP.

  6. If there was a significant SNP (P < 5e-8) remaining after conditioning, the original lead SNP and the new lead SNP were used for a subsequent conditional GWAS at this locus.

  7. This was repeated until no more significant SNPs (P < 5e-8) remained at each locus.

  8. Locus zooms were plotted for each locus, using both the unconditioned and conditioned GWAS results.

  9. Finally, the resulting list of 27 lead SNPs were saved in a single file ready for conversion to a PRS.

# Defining one SNP per locus ---------------------------------------------------------------------------
# First, filter out P < 5e-8 SNPs and arrange by P
if(file.exists(here("Output/Temp/biallelic_signif.RData"))) {
  load(here("Output/Temp/biallelic_signif.RData"))
} else {
  biallelic_signif <- biallelic_sumstat_final %>% 
    filter(P <= 5e-8) %>% 
    arrange(P)
  
  save(biallelic_signif, file = here("Output/Temp/biallelic_signif.RData"))
}

# Grouping into loci +- 500 kb of top SNPs
gout_top <- biallelic_signif %>% 
  slice(1)
gout2 <- biallelic_signif %>% 
  filter(!(CHR == gout_top$CHR[1] & BP %in% ((gout_top$BP[1] - 500000):(gout_top$BP[1] + 500000))))

while(nrow(gout2) > 0) {
  tmp <- gout2 %>% 
    slice(1)
  gout_top <- rbind(tmp, gout_top)
  gout2 <- gout2 %>% 
    filter(!(CHR == gout_top$CHR[1] & BP %in% ((gout_top$BP[1] - 500000):(gout_top$BP[1] + 500000))))
} 

gout_top <- gout_top %>% 
  arrange(CHR, BP)


# Finding regions of loci
biallelic_signif <- biallelic_signif %>% 
  arrange(CHR, BP)

out <- NA
for(i in 2:nrow(biallelic_signif)) {
  if(biallelic_signif$CHR[i] == biallelic_signif$CHR[i - 1]){
    out[i] <- biallelic_signif$BP[i] - biallelic_signif$BP[i - 1]
  } else {
    out[i] <- NA
  }
}

tmp <- biallelic_signif %>% 
  mutate(Diff = out,
         Diff2 = case_when(Diff < 500000 ~ Diff))

out <- biallelic_signif %>% slice(1)
for(i in 2:nrow(biallelic_signif)) {
  if(is.na(tmp$Diff2[i])){
    out <- rbind(out, biallelic_signif %>% slice(i - 1), biallelic_signif %>% slice(i))
  }
}
out <- rbind(out, biallelic_signif %>% slice(nrow(biallelic_signif)))

# Extracting regions
bgen_ranges <- out %>% select(CHR, BP)

tmp1 <- bgen_ranges %>% slice(seq(1, nrow(bgen_ranges), by = 2)) %>% rename(BP1 = BP)

tmp2 <- bgen_ranges %>% slice(seq(2, nrow(bgen_ranges), by = 2)) %>% rename(CHR.x = CHR, BP2 = BP)

bgen_ranges <- tmp1 %>% 
  cbind(tmp2) %>% 
  mutate(BP1 = BP1 - 50000,
         BP2 = BP2 + 50000) %>% 
  select(-CHR.x)

bgen_range1 <- bgen_ranges %>% 
  filter(CHR < 10) %>% 
  mutate(BGEN = paste0("0", CHR, ":", BP1, "-", BP2))

bgen_range2 <- bgen_ranges %>% 
  filter(CHR > 9) %>% 
  mutate(BGEN = paste0(CHR, ":", BP1, "-", BP2))

bgen_ranges <- rbind(bgen_range1, bgen_range2) %>% 
  arrange(CHR, BP1)

tmp <- bgen_ranges %>% 
  select(BGEN)

rm(bgen_range1, bgen_range2, tmp1, tmp2, out, gout2, i)
  
#write_delim(tmp, file = here("Output/Temp", "bgen_range.txt"), delim = "\n", col_names = F)

# Extracting all SNPs from biallelic sumstat that fit within boundaries of loci
if(file.exists(here("Output/Temp/biallelic_loci.RData"))) {
  load(here("Output/Temp/biallelic_loci.RData"))
} else {
  biallelic_loci <- tibble()
  for(i in 1:nrow(bgen_ranges)){
    tmp <- biallelic_sumstat_final %>% 
    filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i]))
    biallelic_loci <- rbind(biallelic_loci, tmp)
  }
  biallelic_loci <- biallelic_loci %>% 
    mutate(SNP_ID2 = paste0(CHR, "_", BP, "_", Alternate_Allele, "_", Effect_Allele))
  #save(biallelic_loci, file = here("Output/Temp/biallelic_loci.RData"))
}

tmp <- biallelic_loci %>% 
  mutate(BP2 = BP) %>% 
  select(CHR, BP, BP2, SNP)

#write_delim(tmp, file = here("Output/Temp", "biallelic_loci_snps.txt"), delim = "\t", col_names = F)

out <- c()
for(i in 1:nrow(bgen_ranges)){
  tmp <- gout_top %>% 
    filter(CHR == bgen_ranges$CHR[i] & between(BP, bgen_ranges$BP1[i], bgen_ranges$BP2[i])) %>% 
    arrange(P) %>% 
    slice(1)
  out <- rbind(out, tmp)
}

gout_top <- out %>% 
  cbind(bgen_ranges %>% select(-CHR))

rm(tmp, bgen_ranges, out, i)



# Extracting all SNPs at loci from bgen files and converting to plink format -------------------------------
#system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "bgen_range.txt"), ' | bcftools reheader -h /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forclumping.vcf.gz" ::: ', paste(unique(gout_top$CHR), collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forclumping.vcf.gz --extract range ', here("Output/Temp/biallelic_loci_snps.txt"), ' --pheno ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --pheno-name plink_goutaff --update-sex ', here("Data/GWAS", "gout_gwas_keep_ids_w_sex.txt"), ' --geno 0.1 --maf 0.01 --hwe 0.000001 --make-bed --out ', here("Output/Temp/"), 'chr{}_tmp" ::: ', paste(unique(gout_top$CHR), collapse = " ")))


# Reading the bim files into R and converting their identifiers to just the rsid
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "_tmp.bim")]

for(i in file_names){
  assign(i, read_delim(paste0(here("Output/Temp/"), i), delim = "\t", col_names = F))
  assign(i, get(i) %>% left_join(biallelic_loci, (by = c("X1" = "CHR", "X4" = "BP"))) %>% mutate(SNP_clean = case_when(is.na(RSID) ~ SNP_ID2, TRUE ~ RSID)))
  # assign(paste0(i, "_notequal"), get(i) %>% filter(X2 != SNP)) # all identical
  assign(paste0("new_", i), get(i) %>% select(X1, SNP_clean, X3:X6))
  #write_delim(get(paste0("new_", i)), file = paste0(here("Output/Temp/"), i), delim = "\t", col_names = F)
}

rm(list = ls()[str_detect(ls(), ".bim")], i, file_names)



# Running the conditional GWAS ----------------------------------------
# Split up the plink files to have one locus per file (saves on computational time)
gout_top2 <- gout_top %>% 
  select(CHR, BP1, BP2, RSID)

for(i in 1:nrow(gout_top2)){
  tmp <- gout_top2 %>% slice(i)
  #write_delim(tmp, file = paste0(here("Output/Temp/"), "extractrange_", tmp$RSID, ".txt"), delim = "\t", col_names = F)
}

#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile {1}/Output/Temp/chr{2}_tmp --extract range {1}/Output/Temp/extractrange_{3}.txt --make-bed --out {1}/Output/Temp/{3}" ::: ', paste(rep(here(), nrow(gout_top)), collapse = " "), ' ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "cat {1}/Output/Temp/{2}.bim | cut -f 2 > {1}/Output/Temp/{2}_snps; split -d -n l/10 {1}/Output/Temp/{2}_snps {1}/Output/Temp/{2}_snps_split" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " ")))

# First round of conditioning

#system(paste0('source ~/.bashrc; parallel "echo {2} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition {2} --out {1}/Output/Temp/{2}_split{3}" ::: ', here(), ' ::: ', paste(gout_top$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

# Reading all the outputs into R
file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+.assoc.logistic"))]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:nrow(gout_top)){
  tmp3 <- c()
  for(j in 0:9){
    tmp2 <- get(paste0(gout_top$RSID[i], "_split0", j, ".assoc.logistic"))
    tmp3 <- rbind(tmp3, tmp2)
  }
  assign(paste0(gout_top$RSID[i], "_gwas"), tmp3 %>% na.omit())
  tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  tmp <- rbind(tmp, tmp3)
}
tmp <- tmp %>% 
  rename(new_lead = SNP, new_p = P)
gout_top2 <- gout_top %>% 
  cbind(tmp)

gout_top_resid <- gout_top2 %>%
  filter(new_p < 5e-8)

rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)


# Second round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}'"), ' > {1}/Output/Temp/{2}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(gout_top_resid$new_lead, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_2 --out {1}/Output/Temp/{2}_split{3}_2" ::: ', here(), ' ::: ', paste(gout_top_resid$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_2.assoc.logistic"))]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:nrow(gout_top_resid)){
  tmp3 <- c()
  for(j in 0:9){
    tmp2 <- get(paste0(gout_top_resid$RSID[i], "_split0", j, "_2.assoc.logistic"))
    tmp3 <- rbind(tmp3, tmp2)
  }
  assign(paste0(gout_top_resid$RSID[i], "_gwas2"), tmp3 %>% na.omit())
  tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  tmp <- rbind(tmp, tmp3)
}

tmp <- tmp %>% 
  rename(new_lead2 = SNP, new_p2 = P)
gout_top3 <- gout_top_resid %>% 
  cbind(tmp)

gout_top_resid2 <- gout_top3 %>%
  filter(new_p2 < 5e-8)

rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)

# Third round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}'"), ' > {1}/Output/Temp/{2}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid2$new_lead2, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_3 --out {1}/Output/Temp/{2}_split{3}_3" ::: ', here(), ' ::: ', paste(gout_top_resid2$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_3.assoc.logistic"))]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:nrow(gout_top_resid2)){
  tmp3 <- c()
  for(j in 0:9){
    tmp2 <- get(paste0(gout_top_resid2$RSID[i], "_split0", j, "_3.assoc.logistic"))
    tmp3 <- rbind(tmp3, tmp2)
  }
  assign(paste0(gout_top_resid2$RSID[i], "_gwas3"), tmp3 %>% na.omit())
  tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  tmp <- rbind(tmp, tmp3)
}

tmp <- tmp %>% 
  rename(new_lead3 = SNP, new_p3 = P)
gout_top4 <- gout_top_resid2 %>% 
  cbind(tmp)

gout_top_resid3 <- gout_top4 %>%
  filter(new_p3 < 5e-8)

rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)


# Fourth round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}'"), ' > {1}/Output/Temp/{2}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid3$new_lead3, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_4 --out {1}/Output/Temp/{2}_split{3}_4" ::: ', here(), ' ::: ', paste(gout_top_resid3$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_4.assoc.logistic"))]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:nrow(gout_top_resid3)){
  tmp3 <- c()
  for(j in 0:9){
    tmp2 <- get(paste0(gout_top_resid3$RSID[i], "_split0", j, "_4.assoc.logistic"))
    tmp3 <- rbind(tmp3, tmp2)
  }
  assign(paste0(gout_top_resid3$RSID[i], "_gwas4"), tmp3 %>% na.omit())
  tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  tmp <- rbind(tmp, tmp3)
}

tmp <- tmp %>% 
  rename(new_lead4 = SNP, new_p4 = P)
gout_top5 <- gout_top_resid3 %>% 
  cbind(tmp)

gout_top_resid4 <- gout_top5 %>%
  filter(new_p4 < 5e-8)

rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)


# Fifth round of conditioning
#system(paste0('source ~/.bashrc; parallel "echo {4} >> {1}/Output/Temp/{2}_snps_split{3}" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel --xapply "echo $', paste0("'{2}\n{3}\n{4}\n{5}\n{6}'"), ' > {1}/Output/Temp/{2}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead2, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead3, collapse = " "), ' ::: ', paste(gout_top_resid4$new_lead4, collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile {1}/Output/Temp/{2} --extract {1}/Output/Temp/{2}_snps_split{3} --logistic sex --ci 0.95 --covar {1}/Data/GWAS/gout_gwas_covar.covar --covar-name Age,pc1-pc40 --condition-list {1}/Output/Temp/{2}_5 --out {1}/Output/Temp/{2}_split{3}_5" ::: ', here(), ' ::: ', paste(gout_top_resid4$RSID, collapse = " "), ' ::: ', paste(paste0(0, 0:9), collapse = " ")))

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), regex("rs[0-9]+_split[0-9]+_5.assoc.logistic"))]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:nrow(gout_top_resid4)){
  tmp3 <- c()
  for(j in 0:9){
    tmp2 <- get(paste0(gout_top_resid4$RSID[i], "_split0", j, "_5.assoc.logistic"))
    tmp3 <- rbind(tmp3, tmp2)
  }
  assign(paste0(gout_top_resid4$RSID[i], "_gwas5"), tmp3 %>% na.omit())
  tmp3 <- tmp3 %>% select(SNP, P) %>% arrange(P) %>% slice(1)
  tmp <- rbind(tmp, tmp3)
}

tmp <- tmp %>% 
  rename(new_lead5 = SNP, new_p5 = P)
gout_top6 <- gout_top_resid4 %>% 
  cbind(tmp)

gout_top_resid5 <- gout_top6 %>%
  filter(new_p5 < 5e-8)

rm(list = ls()[str_detect(ls(), ".assoc")], i, tmp, tmp2, file_names, tmp3, j)
  


# Locus zooms ---------------------------------------
# Loading in code and gene list
source(here("Script/Functions/locus_zoom.R"))

UCSC_GRCh37_Genes_UniqueList.txt <- as.data.frame(read_delim(here("Data/GWAS/UCSC_GRCh37_Genes_UniqueList.txt"), delim = "\t"))


# Plotting locus zooms of original GWAS

# Calculating LD
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top$CHR, collapse = " "), ' ::: ', paste(gout_top$RSID, collapse = " ")))

# Reading the LD reports back into R
for(i in 1:nrow(gout_top)){
  assign(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld.ld")))
}

# Making full list of SNPs for labelling
first_round <- gout_top_resid %>% 
  select(new_lead) %>% 
  rename(RSID = new_lead)

second_round <- gout_top_resid2 %>% 
  select(new_lead2) %>% 
  rename(RSID = new_lead2)

third_round <- gout_top_resid3 %>% 
  select(new_lead3) %>% 
  rename(RSID = new_lead3)

fourth_round <- gout_top_resid4 %>% 
  select(new_lead4) %>% 
  rename(RSID = new_lead4)

gout_top_full <- gout_top %>% 
  select(RSID) %>% 
  rbind(first_round, second_round, third_round, fourth_round) %>% 
  left_join(biallelic_loci, by = "RSID") %>% 
  arrange(CHR, BP)

# Plotting the locus zooms
for(i in 1:nrow(gout_top)){
  locus.zoom(data = biallelic_loci %>% mutate(SNP = RSID) %>% filter(!is.na(SNP), CHR == gout_top$CHR[i] & between(BP, gout_top$BP1[i], gout_top$BP2[i])),
             region = c(gout_top$CHR[i], gout_top$BP1[i], gout_top$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top$CHR[i], "_", gout_top$RSID[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Unconditioned ", gout_top$RSID[i], " Locus Zoom"),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top$CHR[i], "_", gout_top$BP1[i], "_", gout_top$BP2[i], "_", gout_top$RSID[i], "_unconditioned", ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}


# Plotting locus zooms of first round of conditioning

# Remaking LD based on ALL new lead SNPs
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top2$CHR, collapse = " "), ' ::: ', paste(gout_top2$new_lead, collapse = " ")))

# Reading the LD reports back into R
for(i in 1:nrow(gout_top2)){
  assign(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld.ld")))
}

# Plotting locus zooms
for(i in 1:nrow(gout_top2)){
  locus.zoom(data = get(paste0(gout_top2$RSID[i], "_gwas")),
             region = c(gout_top$CHR[i], gout_top2$BP1[i], gout_top2$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top2$CHR[i], "_", gout_top2$new_lead[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Conditioned on ", gout_top2$RSID[i]),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top2$CHR[i], "_", gout_top2$BP1[i], "_", gout_top2$BP2[i], "_", gout_top2$RSID[i], "_condition_", gout_top2$RSID[i], ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}


# Plotting locus zooms of second round of conditioning

# Remaking LD based on new lead SNPs
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top3$CHR, collapse = " "), ' ::: ', paste(gout_top3$new_lead2, collapse = " ")))

# Reading the LD reports back into R
for(i in 1:nrow(gout_top3)){
  assign(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld.ld")))
}

# Plotting locus zooms
for(i in 1:nrow(gout_top3)){
  locus.zoom(data = get(paste0(gout_top3$RSID[i], "_gwas2")),
             region = c(gout_top3$CHR[i], gout_top3$BP1[i], gout_top3$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top3$CHR[i], "_", gout_top3$new_lead2[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Conditioned on ", gout_top3$RSID[i], " and ", gout_top3$new_lead[i]),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top3$CHR[i], "_", gout_top3$BP1[i], "_", gout_top3$BP2[i], "_", gout_top3$RSID[i], "_condition_", gout_top3$RSID[i], "and", gout_top3$new_lead[i], ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}


# Plotting locus zooms of third round of conditioning

# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top4$CHR, collapse = " "), ' ::: ', paste(gout_top4$new_lead3, collapse = " ")))

# Reading the LD reports back into R
for(i in 1:nrow(gout_top4)){
  assign(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld.ld")))
}

# Plotting locus zooms
for(i in 1:nrow(gout_top4)){
  locus.zoom(data = get(paste0(gout_top4$RSID[i], "_gwas3")),
             region = c(gout_top4$CHR[i], gout_top4$BP1[i], gout_top4$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top4$CHR[i], "_", gout_top4$new_lead3[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Conditioned on ", gout_top4$RSID[i], " and ", gout_top4$new_lead[i], " and ", gout_top4$new_lead2[i]),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top4$CHR[i], "_", gout_top4$BP1[i], "_", gout_top4$BP2[i], "_", gout_top4$RSID[i], "_condition_", gout_top4$RSID[i], "and", gout_top4$new_lead[i], "and", gout_top4$new_lead2[i], ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}


# Plotting locus zooms of fourth round of conditioning

# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top5$CHR, collapse = " "), ' ::: ', paste(gout_top5$new_lead4, collapse = " ")))

# Next I need to read the ld reports back into R

for(i in 1:nrow(gout_top5)){
  assign(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld.ld")))
}

for(i in 1:nrow(gout_top5)){
  locus.zoom(data = get(paste0(gout_top5$RSID[i], "_gwas4")),
             region = c(gout_top5$CHR[i], gout_top5$BP1[i], gout_top5$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top5$CHR[i], "_", gout_top5$new_lead4[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Conditioned on ", gout_top5$RSID[i], " and ", gout_top5$new_lead[i], " and ", gout_top5$new_lead2[i], " and ", gout_top5$new_lead3[i]),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top5$CHR[i], "_", gout_top5$BP1[i], "_", gout_top5$BP2[i], "_", gout_top5$RSID[i], "_condition_", gout_top5$RSID[i], "and", gout_top5$new_lead[i], "and", gout_top5$new_lead2[i], "and", gout_top5$new_lead3[i], ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}

# Plotting locus zooms of fifth round of conditioning

# Remaking LD based on new lead SNP
#system(paste0('source ~/.bashrc; parallel --xapply "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --r2 inter-chr --ld-snp {2} --ld-window-r2 0 --out ', here("Output/Temp/"), 'chr{1}_{2}_ld" ::: ', paste(gout_top6$CHR, collapse = " "), ' ::: ', paste(gout_top6$new_lead5, collapse = " ")))

# Next I need to read the ld reports back into R

for(i in 1:nrow(gout_top6)){
  assign(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld"), read_table(paste0(here("Output/Temp/"), "chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld.ld")))
}

for(i in 1:nrow(gout_top6)){
  locus.zoom(data = get(paste0(gout_top6$RSID[i], "_gwas5")),
             region = c(gout_top6$CHR[i], gout_top6$BP1[i], gout_top6$BP2[i]),
             offset_bp = 0,
             ld.file = get(paste0("chr", gout_top6$CHR[i], "_", gout_top6$new_lead4[i], "_ld")),
             genes.data = UCSC_GRCh37_Genes_UniqueList.txt,
             plot.title = paste0("Conditioned on ", gout_top6$RSID[i], " and ", gout_top6$new_lead[i], " and ", gout_top6$new_lead2[i], " and ", gout_top6$new_lead3[i]),
             file.name = paste0(here("Output/Plots/"), "Chr", gout_top6$CHR[i], "_", gout_top6$BP1[i], "_", gout_top6$BP2[i], "_", gout_top6$RSID[i], "_condition_", gout_top6$RSID[i], "and", gout_top6$new_lead[i], "and", gout_top6$new_lead2[i], "and", gout_top6$new_lead3[i], ".jpg"),
             secondary.snp = gout_top_full$RSID,
             secondary.label = TRUE)
}



# Combining all GWAS results together into final list ----------------------------------------------
regions <- gout_top %>%
  select(CHR, BP1, BP2)

out <- c()
for(i in 1:nrow(regions)){
  tmp <- gout_top_full %>% 
    filter(CHR == regions$CHR[i] & between(BP, regions$BP1[i], regions$BP2[i])) %>% 
    mutate(BP1 = regions$BP1[i], 
           BP2 = regions$BP2[i])
  out <- rbind(out, tmp)
}

gout_top_full <- out

# For each of the loci with multiple SNPs, I need to test the association of each SNP after adjusting for all others at the locus
# So first lets pull out all SNPs from those loci
multi_snps <- gout_top_full %>% 
  filter(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1]))

tmp <- multi_snps %>% select(RSID)

#write_delim(tmp, file = here("Output/Temp/snps_to_extract.txt"), delim = "\n", col_names = F)

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --extract ', here("Output/Temp/snps_to_extract.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_test" ::: ', paste(unique(multi_snps$CHR), collapse = " ")))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr4_test --bmerge ', here("Output/Temp/"), 'chr11_test --make-bed --out ', here("Output/Temp/"), 'merged_test'))

for(i in 6:9){
  system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$RSID[i], ' --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}

for(i in 1:5){
  write_delim(multi_snps %>% slice(1:5) %>% select(RSID) %>% filter(RSID != multi_snps$RSID[i]), file = paste0(here("Output/Temp/"), "conditionlist_", multi_snps$RSID[i], ".txt"), delim = "\n", col_names = F)
  
  system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_', multi_snps$RSID[i], '.txt --out ', here("Output/Temp/"), 'final_gwas_', multi_snps$RSID[i]))
}

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "final_gwas_.+logistic")]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:5){
  tmp2 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(i)
  tmp <- rbind(tmp, tmp2)
}

tmp2 <- c()
for(i in 6:9){
  tmp3 <- get(paste0("final_gwas_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(-(1:5))
  tmp2 <- rbind(tmp2, tmp3)
}

tmp2 <- tmp2 %>% slice(5, 2, 15, 12)

multi_snps2 <- rbind(tmp, tmp2)

tmp2 <- multi_snps2 %>% 
  select(CHR:BP, OR:U95, P)

multi_snps3 <- left_join(multi_snps, tmp2, by = c("CHR", "BP"))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_test --r2 inter-chr --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_ld'))

merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_ld.ld"))

# test multicollinearity in SLC2A9 model in next Rmd

single_snps <- gout_top_full %>% 
  filter(!(BP1 %in% names(table(gout_top_full$BP1)[table(gout_top_full$BP1) > 1])))

single_snps2 <- single_snps %>% 
  select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO, BP1, BP2)

multi_snps4 <- multi_snps3 %>% 
  select(CHR, RSID, BP:Alternate_Allele, OR.y:U95.y, P.y, OR.x:U95.x, P.x, EAF, INFO, BP1, BP2) %>% 
  rename(OR = OR.y,
         SE = SE.y, 
         L95 = L95.y,
         U95 = U95.y,
         P = P.y,
         OR_old = OR.x,
         SE_old = SE.x, 
         L95_old = L95.x,
         U95_old = U95.x,
         P_old = P.x)

gout_top_final <- full_join(multi_snps4, single_snps2) %>% 
  arrange(CHR, BP)

# Flipping allele order + OR etc so effect allele is always the gout risk allele + labelling based on locus zooms
smallOR <- gout_top_final %>% 
  filter(OR < 1) %>% 
  mutate(OR = as.numeric(signif(1/OR, digits = 4)),
         tmp_L = as.numeric(signif(1/L95, digits = 4)),
         tmp_U = as.numeric(signif(1/U95, digits = 4)),
         U95 = tmp_L,
         L95 = tmp_U,
         OR_old = as.numeric(round(1/OR_old, digits = 3)),
         tmp_L_old = as.numeric(round(1/L95_old, digits = 3)),
         tmp_U_old = as.numeric(round(1/U95_old, digits = 3)),
         U95_old = tmp_L_old,
         L95_old = tmp_U_old,
         EAF = 1 - EAF) %>% 
  rename(allele2 = Effect_Allele,
         allele1 = Alternate_Allele) %>% 
  rename(Alternate_Allele = allele2,
         Effect_Allele = allele1) %>% 
  select(CHR:BP, Effect_Allele, Alternate_Allele, OR:BP2)
bigOR <- gout_top_final %>% 
  filter(OR > 1)
gout_top_final <- full_join(smallOR, bigOR) %>% 
  arrange(CHR, BP) %>% 
  mutate(Locus_Name = c("ARID1A", "PDZK1", "TRIM46", "GCKR", "SFMBT1", "SLC2A9", "SLC2A9", "SLC2A9", "SLC2A9", "SLC2A9", "ABCG2", "ABCG2", "ADH1B", "TMEM171", "RREB1", "SLC17A1", "MLXIPL", "SLC16A9", "ANO3", "SLC22A11", "SLC22A11", "OVOL1", "R3HDM2", "MLXIP", "NFAT5", "INSR", "PNPLA3"))

UKBB_Gene_OR <- gout_top_final
  
save(UKBB_Gene_OR, file = here("Output/UKBB_Gene_OR.RData"))

# Cleaning up
rm(list = ls()[str_detect(ls(), "^chr|^gout_top|^final_|gwas.$")], bigOR, first_round, multi_snps, multi_snps2, multi_snps3, out, regions, second_round, single_snps, smallOR, third_round, tmp, tmp2, tmp3, file_names, file_names2, i)

The Imputed PRS was then modified to form a Genotyped PRS as follows:

  1. Biallelic SNPs from the imputed UK Biobank were filtered to only include those directly genotyped in the CoreExome (~ 250,000).

  2. This list of SNPs was overlapped with the final SNP list for the European analysis to find those SNPs directly genotyped in the CoreExome (3 / 27).

  3. For any SNP that was not directly genotyped, a full LD report was produced with candidate proxy SNPs based on the ~250,000 SNPs above.

  4. The SNP with the highest R-squared for each SNP in the European PRS was extracted, then this list was filtered to only include those SNPs that met P < 5e-8 in the full UK Biobank gout GWAS (15 / 24).

  5. The 3 directly genotyped SNPs were combined with these 15 proxies to produce a single file of 18 lead SNPs for the Genotyped PRS.

# Load in SNPs for inclusion in Polynesian analysis
load(here("Output/Temp/biallelic_sumstat_final_poly.RData"))
load(here("Output/UKBB_Gene_OR.RData"))

# Next I need to test the 27 lead SNPs from the European analysis for LD with this list of 243,985 variants in order to find the best proxies

# To make this more efficient, I should initially filter the 243,985 SNP list based on the locus boundaries for the Euro PRS
biallelic_loci_poly <- tibble()
tmp_loci <- UKBB_Gene_OR %>% 
  select(CHR, BP1, BP2) %>% 
  unique()
for(i in 1:nrow(tmp_loci)){
  tmp <- biallelic_sumstat_final_poly %>% 
  filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
  biallelic_loci_poly <- rbind(biallelic_loci_poly, tmp)
}

out <- c()
for(i in 1:nrow(UKBB_Gene_OR)){
  test <- biallelic_loci_poly %>% 
    filter(CHR == UKBB_Gene_OR$CHR[i] & BP == UKBB_Gene_OR$BP[i])
  out <- rbind(out, test)
}

need_proxies <- UKBB_Gene_OR %>% 
  filter(!(BP %in% out$BP))

#write_delim(need_proxies %>% select(RSID), file = here("Output/Temp/need_proxies.txt"), delim = "\t", col_names = F)

biallelic_loci_poly2 <- tibble()
tmp_loci <- need_proxies %>% 
  select(CHR, BP1, BP2) %>% 
  unique()
for(i in 1:nrow(tmp_loci)){
  tmp <- biallelic_sumstat_final_poly %>% 
  filter(CHR == tmp_loci$CHR[i] & between(BP, tmp_loci$BP1[i], tmp_loci$BP2[i]))
  biallelic_loci_poly2 <- rbind(biallelic_loci_poly2, tmp)
}

tmp <- biallelic_loci_poly2 %>% 
  mutate(BP1 = BP) %>% 
  select(CHR, BP, BP1, RSID)

tmp2 <- need_proxies %>% 
  mutate(BP1 = BP) %>% 
  select(CHR, BP, BP1, RSID)

need_ld <- rbind(tmp, tmp2) %>% 
  arrange(CHR, BP)

# write_delim(need_ld, file = here("Output/Temp/need_ld.txt"), delim = "\t", col_names = F)

# Now I need to make a single UK Biobank plink file with all SNPs above and test LD between the 25 SNPs of interest and all others
# For each SNP, I will then pull out the best proxy and filter the final list for proxies with at least 0.8 R-squared with the lead SNP - either that or based on whether they associate with gout

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/"), 'chr{1}_tmp --extract range ', here("Output/Temp/need_ld.txt"), ' --make-bed --out ', here("Output/Temp/"), 'chr{1}_poly" ::: ', paste(unique(need_ld$CHR), collapse = " ")))

#write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(need_ld$CHR), "_poly")), file = here("Output/Temp/mergefile.txt"), delim = "\n", col_names = F)

#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile.txt --make-bed --out ', here("Output/Temp/"), 'merged_poly'))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly --r2 inter-chr --ld-snp-list ', here("Output/Temp/"), 'need_proxies.txt --ld-window-r2 0 --out ', here("Output/Temp/"), 'merged_poly_ld'))

merged_ld <- read_table(paste0(here("Output/Temp/"), "merged_poly_ld.ld"))

out1 <- tibble()
for(i in need_proxies$RSID){
  tmp <- merged_ld %>% filter(SNP_A == i & SNP_B != i) %>% arrange(desc(R2)) %>% slice(1)
  out1 <- rbind(out1, tmp)
}

tmp <- biallelic_loci_poly %>% 
  filter(RSID %in% out1$SNP_B) # 1 SNP was the best proxy for two of the SLC2A9 SNPs

tmp2 <- tmp %>% 
  filter(P < 5e-8) # keeping only those that were significant (rather than keeping based on LD)

out2 <- out1 %>% 
  filter(SNP_B %in% tmp2$RSID)

out <- c()
for(i in 1:nrow(UKBB_Gene_OR)){
  test <- biallelic_loci_poly %>% 
    filter(CHR == UKBB_Gene_OR$CHR[i] & BP == UKBB_Gene_OR$BP[i])
  out <- rbind(out, test)
}

full_list <- rbind(out, tmp2) %>% 
  arrange(CHR, BP) %>% 
  select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO)

out1_1 <- out1 %>% 
  select(SNP_A, SNP_B, R2)

full_list2 <- full_list %>% 
  left_join(out1_1, by = c("RSID" = "SNP_B")) %>% 
  slice(-5) # removing the duplicated SLC2A9 SNP

tmp <- UKBB_Gene_OR %>% 
  filter(RSID %in% full_list2$SNP_A) %>% 
  select(CHR, BP, RSID, BP1:Locus_Name)

tmp2 <- UKBB_Gene_OR %>% 
  filter(RSID %in% out$RSID) %>% 
  select(CHR, BP, RSID, BP1:Locus_Name)

locus_names <- rbind(tmp, tmp2) %>% 
  arrange(CHR, BP) %>% 
  select(BP1:Locus_Name)

full_list3 <- full_list2 %>% 
  cbind(locus_names) %>% 
  rename(Old_Lead = SNP_A)

# For each of the loci with multiple SNPs, I need to test the association of each SNP after adjusting for all others at the locus
# So first lets pull out all SNPs from those loci
multi_snps <- full_list3 %>% 
  filter(BP1 %in% names(table(full_list3$BP1)[table(full_list3$BP1) > 1]))

tmp <- multi_snps %>% select(RSID)

#write_delim(tmp, file = here("Output/Temp/snps_to_extract_poly.txt"), delim = "\n", col_names = F)

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly --extract ', here("Output/Temp/snps_to_extract_poly.txt"), ' --make-bed --out ', here("Output/Temp/"), 'merged_poly_multi'))

for(i in 4:7){
  system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly_multi --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition ', multi_snps$RSID[i], ' --out ', here("Output/Temp/"), 'final_gwas_poly_', multi_snps$RSID[i]))
}

for(i in 1:3){
  write_delim(multi_snps %>% slice(1:3) %>% select(RSID) %>% filter(RSID != multi_snps$RSID[i]), file = paste0(here("Output/Temp/"), "conditionlist_poly_", multi_snps$RSID[i], ".txt"), delim = "\n", col_names = F)
  
  system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_poly_multi --logistic sex --ci 0.95 --covar ', here("Data/GWAS", "gout_gwas_covar.covar"), ' --covar-name Age,pc1-pc40 --condition-list ', here("Output/Temp/"), 'conditionlist_poly_', multi_snps$RSID[i], '.txt --out ', here("Output/Temp/"), 'final_gwas_poly_', multi_snps$RSID[i]))
}

file_names <- list.files(here("Output/Temp/"))[str_detect(list.files(here("Output/Temp/")), "final_gwas_poly_.+logistic")]

for(i in file_names){
  assign(i, read.table(paste0(here("Output/Temp/"), i), header = T) %>% filter(TEST == "ADD"))
}

tmp <- c()
for(i in 1:3){
  tmp2 <- get(paste0("final_gwas_poly_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(i)
  tmp <- rbind(tmp, tmp2)
}

tmp2 <- c()
for(i in 4:7){
  tmp3 <- get(paste0("final_gwas_poly_", multi_snps$RSID[i], ".assoc.logistic")) %>% slice(-(1:3))
  tmp2 <- rbind(tmp2, tmp3)
}

tmp2 <- tmp2 %>% slice(5, 2, 15, 12)

multi_snps2 <- rbind(tmp, tmp2)

tmp2 <- multi_snps2 %>% 
  select(CHR:BP, OR:U95, P)

multi_snps3 <- left_join(multi_snps, tmp2, by = c("CHR", "BP"))

single_snps <- full_list3 %>% 
  filter(!(BP1 %in% names(table(full_list3$BP1)[table(full_list3$BP1) > 1])))

single_snps2 <- single_snps %>% 
  select(CHR, RSID, BP:Alternate_Allele, OR:U95, P, EAF, INFO, Old_Lead, BP1, BP2, Locus_Name)

multi_snps4 <- multi_snps3 %>% 
  select(CHR, RSID, BP:Alternate_Allele, OR.y:U95.y, P.y, OR.x:U95.x, P.x, EAF, INFO, Old_Lead, BP1, BP2, Locus_Name) %>% 
  rename(OR = OR.y,
         SE = SE.y, 
         L95 = L95.y,
         U95 = U95.y,
         P = P.y,
         OR_old = OR.x,
         SE_old = SE.x, 
         L95_old = L95.x,
         U95_old = U95.x,
         P_old = P.x)

full_list4 <- full_join(multi_snps4, single_snps2) %>% 
  arrange(CHR, BP)

smallOR <- full_list4 %>% 
  filter(OR < 1) %>% 
  mutate(OR = as.numeric(signif(1/OR, digits = 4)),
         tmp_L = as.numeric(signif(1/L95, digits = 4)),
         tmp_U = as.numeric(signif(1/U95, digits = 4)),
         U95 = tmp_L,
         L95 = tmp_U,
         EAF = 1 - EAF) %>% 
  rename(allele2 = Effect_Allele,
         allele1 = Alternate_Allele) %>% 
  rename(Alternate_Allele = allele2,
         Effect_Allele = allele1) %>% 
  select(CHR:BP, Effect_Allele, Alternate_Allele, OR:Locus_Name)
bigOR <- full_list4 %>% 
  filter(OR > 1)
full_list4 <- full_join(smallOR, bigOR) %>% 
  arrange(CHR, BP)

Poly_Gene_OR <- full_list4

save(Poly_Gene_OR, file = here("Output/Poly_Gene_OR.RData"))

# Cleaning up
rm(biallelic_loci_poly, biallelic_loci_poly2, biallelic_sumstat_final_poly, bigOR, full_list, full_list2, full_list3, locus_names, merged_ld, need_ld, need_proxies, out, out1, out1_1, out2, smallOR, test, tmp, tmp_loci, tmp2, i)

Locus-Zooms

All of the locus zooms are plotted below in separate tabs:

file_names <- list.files(here("Output/Plots"), full.names = T)

tmp <- file_names %>% 
  as_tibble() %>% 
  separate(value, sep = "_", into = c(NA, "X2", "BP1", NA, NA, "Cond", "CondSNPs")) %>% 
  rownames_to_column() %>% 
  mutate(Cond1 = case_when(Cond == "unconditioned.jpg" ~ FALSE, TRUE ~ TRUE), 
         BP1 = as.numeric(BP1), 
         rowname = as.numeric(rowname)) %>% 
  separate(X2, sep = "/", into = c(NA, NA, NA, NA, NA, NA, NA, "CHR")) %>%
  mutate(CHR = as.numeric(str_replace(CHR, "Chr", ""))) %>% 
  arrange(CHR, BP1, Cond1, CondSNPs)

tmp1 <- tmp %>% 
  pull(BP1) %>% 
  unique() %>% 
  as_tibble() %>% 
  rename(BP1 = value)

load(here("Output/UKBB_Gene_OR.RData"))

tmp2 <- UKBB_Gene_OR %>% 
  pull(Locus_Name) %>% 
  unique() %>% 
  as_tibble() %>% 
  rename(Locus_Name = value) %>% 
  cbind(tmp1)

tmp3 <- tmp %>% 
  left_join(tmp2, by = "BP1")

tmp4 <- tmp3 %>% 
  mutate(CondSNPs2 = str_remove(CondSNPs, ".jpg")) %>% 
  separate(CondSNPs2, sep = "and", into = c("SNP1", "SNP2", "SNP3", "SNP4")) %>% 
  mutate(SNPs = case_when(is.na(SNP2) ~ SNP1,
                          !is.na(SNP2) & is.na(SNP3) ~ str_c(SNP1, SNP2, sep = " and "),
                          !is.na(SNP3) & is.na(SNP4) ~ str_c(SNP1, SNP2, SNP3, sep = " and "),
                          !is.na(SNP4) ~ str_c(SNP1, SNP2, SNP3, SNP4, sep = " and ")),
         Plot_Name = case_when(!Cond1 ~ paste0(Locus_Name, " (Uncond.)"),
                               Cond1 ~ paste0(Locus_Name, " (Cond. on ", SNPs, ")")))

file_names2 <- file_names[tmp$rowname]

names(file_names2) <- tmp4$Plot_Name

template <- c(
    "#### {{nm}}\n",
    "```{r, echo = FALSE}\n",
    "include_graphics(file_names2['{{nm}}'])\n",
    "```\n",
    "\n"
  )

plots <- lapply(
  tmp4$Plot_Name, 
  function(nm) knit_expand(text = template)
)

ARID1A (Uncond.)

ARID1A (Cond. on rs114165349)

PDZK1 (Uncond.)

PDZK1 (Cond. on rs1967017)

TRIM46 (Uncond.)

TRIM46 (Cond. on rs11264341)

GCKR (Uncond.)

GCKR (Cond. on rs1260326)

SFMBT1 (Uncond.)

SFMBT1 (Cond. on rs2581790)

SLC2A9 (Uncond.)

SLC2A9 (Cond. on rs938558)

SLC2A9 (Cond. on rs938558 and rs10805346)

SLC2A9 (Cond. on rs938558 and rs10805346 and rs11723439)

SLC2A9 (Cond. on rs938558 and rs10805346 and rs11723439 and rs10939671)

ABCG2 (Uncond.)

ABCG2 (Cond. on rs2231142)

ABCG2 (Cond. on rs2231142 and rs28366540)

ADH1B (Uncond.)

ADH1B (Cond. on rs1229984)

TMEM171 (Uncond.)

TMEM171 (Cond. on rs13160226)

RREB1 (Uncond.)

RREB1 (Cond. on rs13191182)

SLC17A1 (Uncond.)

SLC17A1 (Cond. on rs1165154)

MLXIPL (Uncond.)

MLXIPL (Cond. on rs13240065)

SLC16A9 (Uncond.)

SLC16A9 (Cond. on rs1171615)

ANO3 (Uncond.)

ANO3 (Cond. on rs7116077)

SLC22A11 (Uncond.)

SLC22A11 (Cond. on rs7943154)

SLC22A11 (Cond. on rs7943154 and rs11605121)

OVOL1 (Uncond.)

OVOL1 (Cond. on rs948493)

R3HDM2 (Uncond.)

R3HDM2 (Cond. on rs3741414)

MLXIP (Uncond.)

MLXIP (Cond. on rs7484733)

NFAT5 (Uncond.)

NFAT5 (Cond. on rs138993217)

INSR (Uncond.)

INSR (Cond. on rs12973279)

PNPLA3 (Uncond.)

PNPLA3 (Cond. on rs738408)

Summary

In summary, I produced2 lists of SNPs that will be used to create two different PRS’s. The first uses a mixture of imputed and genotyped SNPs, while the second uses only SNPs genotyped on the Human CoreExome v1.0 chip. This will ensure that no imputation was done on Polynesian individuals, which should make the genotypes more reliable, at the expense of not using the best possible variants for the PRS. Three of the 21 total loci had more than one partially independent genome-wide significant signal. These were at SLC2A9 (5 hits), ABCG2 (2 hits), and SLC22A11 (2 hits).


Preparing Phenotype files and making PRS

The purpose of this document is to generate cleaned up phenotype files for each cohort, with the imputed and genotyped polygenic risk scores (PRS’s) included for each. It contains the code for going from the raw phenotype and genotype data (in combination with the SNP lists generated in PRS_GWAS.Rmd) to the finalized data frames for analysis.

The phenotypes of interest are the following (note some may be poorly phenotyped):

  1. Self-reported gout status (i.e. gout vs control)

  2. Self-reported age at collection

  3. Genetically determined sex

  4. Genetic principal components (all 10 global PCs and all 10 Oceanian PCs for Polynesians)

  5. Self-reported age at gout onset

    • Disease duration derived from this and age at collection
  6. Self-reported presence of tophi

  7. Self-reported flare frequency (number of flares in the last year)

  8. Serum urate at collection

  9. Self-reported urate lowering therapy data (at collection)

  10. Self-reported gout prophylaxis data (at collection)

  11. Genetic ancestry data (i.e. European vs West Polynesian vs East Polynesian)

  12. Comorbidity data, including BMI, hypertension, diabetes, heart disease (angina, myocardial infarction, or heart failure), kidney disease (serum creatinine/eGFR), dyslipidemia, stroke - including self report, medication, and metrics such as BMI (for descriptive stats table)

  13. Lifestyle factors - total alcohol consumption, sugar-sweetened drink consumption, smoking status (for descriptive stats table)

  14. Self-reported family history of gout

Exclusion criteria:

  1. Genetic sex to self-report gender mismatch
# Loading PRS SNPs for European and Polynesian analyses
load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Poly_Gene_OR.RData"))


# Extracting SNPs from the CoreExome VCFs using plink ----------------------------------
if(!file.exists(here("Output/Temp/UKBB_SNPs_Plink.txt"))){
  for_plink <- UKBB_Gene_OR %>%
    select(CHR, BP, RSID) %>%
    mutate(BP2 = BP) %>%
    select(CHR, BP, BP2, RSID)
  write_delim(for_plink, file = here("Output/Temp/UKBB_SNPs_Plink.txt"), col_names = F)
  rm(for_plink)
}

if(!file.exists(here("Output/Temp/Poly_SNPs_Plink.txt"))){
  for_plink <- Poly_Gene_OR %>%
    select(CHR, BP, RSID) %>%
    mutate(BP2 = BP) %>%
    select(CHR, BP, BP2, RSID)
  write_delim(for_plink, file = here("Output/Temp/Poly_SNPs_Plink.txt"), col_names = F)
  rm(for_plink)
}

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --vcf /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Imputed_Genotypes/QC1-10_Impute_EUR_only/CZ-MB1.2-QC1.10_EUR_imputed_chr{}.vcf.gz --extract range ', here("Output/Temp/UKBB_SNPs_Plink.txt"), ' --make-bed --out ', here("Output/Temp/UKBB_SNPs_"), 'chr{}" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b6.10 --bfile ', here("Output/Temp/UKBB_SNPs_"), 'chr{} --set-missing-var-ids @:# --make-bed --out ', here("Output/Temp/UKBB_SNPs_"), 'chr{}_2" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))

#write_delim(as_tibble(paste0(here("Output/Temp/"), "UKBB_SNPs_chr", unique(UKBB_Gene_OR$CHR), "_2")), file = here("Output/Temp/mergefile_prs1.txt"), delim = "\n", col_names = F)

#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile_prs1.txt --make-bed --out ', here("Output/Temp/"), 'merged_PRS_UKBB'))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_PRS_UKBB --recode --out ', here("Output/Temp/"), 'merged_PRS_UKBB'))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile /Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted --extract range ', here("Output/Temp/Poly_SNPs_Plink.txt"), ' --make-bed --out ', here("Output/Temp/Poly_SNPs")))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'Poly_SNPs --recode --out ', here("Output/Temp/"), 'Poly_SNPs'))



# Extracting SNPs from the UK Biobank  and converting to merged plink file -------------------------------
if(!file.exists(here("Output/Temp/PRS_SNPs_BGEN.txt"))){
  tmp <- UKBB_Gene_OR %>%
    select(CHR, BP)

  tmp2 <- Poly_Gene_OR %>%
    select(CHR, BP)

  tmp3 <- rbind(tmp, tmp2) %>%
    arrange(CHR, BP) %>%
    unique()

  bgen_range1 <- tmp3 %>%
    filter(CHR < 10) %>%
    mutate(BGEN = paste0("0", CHR, ":", BP, "-", BP))

  bgen_range2 <- tmp3 %>%
    filter(CHR > 9) %>%
    mutate(BGEN = paste0(CHR, ":", BP, "-", BP))

  bgen_range <- rbind(bgen_range1, bgen_range2) %>%
    arrange(CHR, BP) %>%
    select(BGEN)

  write_delim(bgen_range, file = here("Output/Temp/PRS_SNPs_BGEN.txt"), delim = "\n", col_names = F)

  rm(tmp, tmp2, tmp3, bgen_range1, bgen_range2, bgen_range)
}

#system(paste0('source ~/.bashrc; parallel "bgenix -g /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/ukb_imp_chr{}_v3.bgen -vcf -incl-range ', here("Output/Temp", "PRS_SNPs_BGEN.txt"), ' | bcftools reheader -h /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/new_header.txt | bcftools annotate --rename-chrs /Volumes/scratch/merrimanlab/ukbio/EGAD00010001474/bgen_to_vcf/rename_contigs.txt | bgzip -c > ', here("Output/Temp", "chr"), '{}_forPRS.vcf.gz" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))

#system(paste0('source ~/.bashrc; parallel "plink1.9b4.9 --vcf ', here("Output/Temp/"), 'chr{}_forPRS.vcf.gz --make-bed --out ', here("Output/Temp/"), 'chr{}_PRS" ::: ', paste(unique(UKBB_Gene_OR$CHR), collapse = " ")))

#write_delim(as_tibble(paste0(here("Output/Temp/"), "chr", unique(UKBB_Gene_OR$CHR), "_PRS")), file = here("Output/Temp/mergefile_prs.txt"), delim = "\n", col_names = F)

#system(paste0('source ~/.bashrc; plink1.9b6.10 --merge-list ', here("Output/Temp/"), 'mergefile_prs.txt --make-bed --out ', here("Output/Temp/"), 'merged_PRS'))

#system(paste0('source ~/.bashrc; plink1.9b6.10 --bfile ', here("Output/Temp/"), 'merged_PRS --recode --out ', here("Output/Temp/"), 'merged_PRS'))



# Making phenotype files -----------------------------------------------------------------------------------
if(file.exists(here("Output/Phenotypes.RData"))){
  load(here("Output/Phenotypes.RData"))
  } else {
  # CoreExome QC 1-10 Phenotype file (made by Tanya)
  CoreExPheno <- read_delim(here("Data/Phenotypes/CZ-MB1.2-QC1.10_MergedPhenotypes_20082020.txt"), delim = "\t") %>%
    mutate(across(where(is_character), factor))
  
  # European cohorts = All Ardea (split into each study), EuroGout (includes EireGout), Gout in Aotearoa (combine with AGRIA + DM + RD + NP + LPA => ANZ cohort)
  All_Euro_ID <- read_delim(here("Output/Temp/merged_PRS_UKBB.fam"), delim = " ", col_names = F)
  
  CoreExPheno_Euro <- CoreExPheno %>% 
    filter(Geno.BroadAncestry == "European",
           Geno.SampleID %in% All_Euro_ID$X2,
           General.Use != "No",
           !(Pheno.Study %in% c("Auckland Controls", "Australian Controls", "ESR", "Rheumatoid Arthritis")))
  # 1,146 NU, 455 HU controls (1,601 total) + 4,694 gout = either GP or ACR or self-report
  
  # Polynesian cohorts = AGRIA, All Ardea, DM, EuroGout, Aotearoa (both new + old), LPA, Ngati Porou, RD => all split into East and West
  All_CoreEx_ID <- read_delim(here("Data/Genotypes/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.fam"), delim = " ", col_names = F)
  
  CoreExPheno_Poly <- CoreExPheno %>% 
    filter(Geno.BroadAncestry == "Oceanian",
           Geno.SampleID %in% All_CoreEx_ID$X2,
           General.Use != "No",
           !(Pheno.Study %in% c("ESR", "Pacific Trust")))
  # 1,021 NU, 248 HU controls (1,269 total) + 1,380 gout
  
  CoreExPheno_Final <- full_join(CoreExPheno_Euro, CoreExPheno_Poly) %>% 
    filter(Geno.GeneticSex != "Unknown",
           !is.na(Pheno.GoutSummary)) %>% 
    mutate(Pheno.GoutSummary = factor(case_when(Pheno.GoutSummary == "Gout" ~ "Gout", 
                                                Pheno.GoutSummary %in% c("Control", "HyperU") ~ "Control")),
           across(where(is.factor), factor)) %>% 
    select(Pheno.SampleID:Pheno.UrateTherapy, GenStudio.ChipType, GenStudio.CallRate:Notes)
  
  rm(CoreExPheno, CoreExPheno_Euro, CoreExPheno_Poly, All_CoreEx_ID, All_Euro_ID)
  
  
  # The second CoreEx phenotype file (doesn't add much except TOPHIGOUT and T2DIABETES, but doesn't have cleaned up IDs and so should probably not be used)
  # CoreExPheno2 <- read_delim(here("Data/Phenotypes/CoreExPheno150618.txt"), delim = "\t") %>%
  #   mutate(across(where(is_character), as_factor)) %>% 
  #   filter(SUBJECT %in% CoreExPheno_Final$Pheno.SampleID | ALTID %in% CoreExPheno_Final$Pheno.SampleID | SUBJECT %in% CoreExPheno_Final$Geno.SampleID | ALTID %in% CoreExPheno_Final$Geno.SampleID) %>% 
  #   mutate(GENECHIPSTATUS = factor(GENECHIPSTATUS, labels = c("Not Genotyped", "Genotyped (CoreEx)", "Genotyped (Public)")),
  #          SEX = factor(SEX, labels = c("Male", "Female")),
  #          ETHCLASS = factor(ETHCLASS, labels = c("East Polynesian", "West Polynesian", "European", "Asian", "African", "Other", "White Hispanic", "Mixed East/West Polynesian", "Unknown")),
  #          GPGOUTAFFSTAT = factor(GPGOUTAFFSTAT, labels = c("Unclear", "Control", "Gout (GP diagnosed)", "Gout (drug trial)", "Gout (EuroGout - assumed gout)", "Control (HU)")),
  #          ACRGOUTAFFSTAT = factor(ACRGOUTAFFSTAT, labels = c("Control", "Gout")),
  #          TOPHIGOUT = factor(TOPHIGOUT, labels = c("Control", "Non-tophaceous gout", "Tophaceous gout")),
  #          DUPLICATEFLAG = factor(DUPLICATEFLAG, labels = c("Excluded", "Included")),
  #          T2DIABETES = factor(T2DIABETES, labels = c("No", "Yes", "Borderline")),
  #          QCBATCH = factor(QCBATCH, levels = 1:13, labels = c("1", "2", "3", "4", "5", "6", "7", "8", "A", "M", "9", "10", "11")),
  #          REPORTEDSEX = factor(REPORTEDSEX, labels = c("Unknown", "Male", "Female")),
  #          GENETICSEX = factor(GENETICSEX, labels = c("Male", "Female", "Undetermined")),
  #          SEXDISCREP = factor(SEXDISCREP, labels = c("No discrepancy", "Discrepancy", "Undetermined")),
  #          FINAL_AFFSTAT = factor(FINAL_AFFSTAT, labels = c("Control", "Gout")),
  #          PCAMATCHING = factor(PCAMATCHING, labels = c("No", "Yes", "Ok match"))) %>% 
  #   select(-STUDYCOHORT, -DNABOXED, -(DNABOX:GWASWELL), -(BEADCHIP:CHIPPOSITION), -PCAETHNOTES)
  # 
  # remove <- c()
  # for(i in 1:ncol(CoreExPheno2)){
  #   if(sum(is.na(CoreExPheno2[[i]])) == nrow(CoreExPheno2)) {
  #     remove <- c(remove, i)
  #   }
  # }
  # 
  # CoreExPheno2 <- CoreExPheno2 %>% 
  #   select(-all_of(remove))
  # 
  # rm(remove, i)
  
  
  # Throughout, duration will be derived from onset - age + 1 (because I am deriving it from ages in years, there is up to 1 additional year duration that needs to be accounted for)
  # Going through each cohort alphabetically: AGRIA, Ardea - Ironwood, Ardea - LASSO, DM, EuroGout, Gout in Aotearoa, LPA, Ngati Porou, RD
  
  logicfactor <- function(x) {
    as.logical(factor(x, levels = c(1, 2), labels = c("FALSE", "TRUE")))
  }
  
  # AGRIA
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "AGRIA")
  
  agria_pheno <- read_delim(here("Data/Phenotypes/AGRIAPheno.txt"), delim = "\t") %>% 
    filter(PATIENT %in% tmp$Pheno.SampleID) %>% 
    left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>%
    mutate(across(where(is_character), factor),
           across(c(DIABETES:KIDNEY, ALLOP, PROBEN:STEROID, ANTIINFLAM, COLCHI, GPGOUT:SUSTOPHUS, FAMGOUT, FAMGOUT3, CAUGOUTAFFSTAT:TOPHIGOUT, URATELOWERING, FOOD),
                  logicfactor),
           SEX = factor(SEX, labels = c("Male", "Female")),
           ETHCLASS = factor(ETHCLASS, 
                             levels = c(1:4, 9), 
                             labels = c("East Polynesian", "West Polynesian", "Caucasian", "Other", "Mixed East/West Polynesian")),
           ASPIRATE = factor(ASPIRATE, 
                             levels = 1:3, 
                             labels = c("No", "Yes", "Unknown")),
           DIURETICINDUCED = factor(DIURETICINDUCED, 
                                    levels = 1:3, 
                                    labels = c("No", "Yes", "Unknown")),
           TOPHUS = factor(TOPHUS, 
                           levels = 1:3, 
                           labels = c("No", "Yes", "Undetermined")),
           TOPHUS = case_when(TOPHUS == "Yes" ~ TRUE, TOPHUS == "No" ~ FALSE, TRUE ~ NA),
           ALLOPSIDE = factor(ALLOPSIDE, 
                              levels = 1:3, 
                              labels = c("No", "Yes", "Not taking allopurinol")),
           OFFWORK = factor(OFFWORK, 
                            levels = 1:3, 
                            labels = c("No", "Yes", "Not applicable as not currently working")),
           ALCOTRIG = factor(ALCOTRIG, 
                             levels = 1:4, 
                             labels = c("No", "Yes", "Unsure", "Non drinker")),
           SECONDARYGOUT = factor(SECONDARYGOUT, 
                                  levels = 1:4, 
                                  labels = c("Primary Gout", "Secondary Gout", "Unknown", "Control")),
           SSBCODE = factor(SSBCODE, 
                            levels = 0:5, 
                            labels = c("0/day", "0.1 - 0.99", "1.0 - 1.99", "2.0 - 2.99", "3.0 - 3.99", "4.0 +")),
           FRUITCODE = factor(FRUITCODE, 
                              levels = 0:5, 
                              labels = c("0/day", "0.1 - 0.99", "1.0 - 1.99", "2.0 - 2.99", "3.0 - 3.99", "4.0 +")),
           DIURETICSUMMARY = factor(DIURETICSUMMARY, 
                                    levels = 1:3, 
                                    labels = c("Not taking diuretics", "Taking diuretics", "Maybe taking diuretics")),
           GOUTSUM = factor(GOUTSUM, 
                            levels = 1:3, 
                            labels = c("Control", "Gout", "May have gout/weak gout"))) %>%
    rename(IID = PATIENT,
           GOUT = Pheno.GoutSummary) %>%
    mutate(SEX = Geno.GeneticSex,
           AGESERUM = round(as.duration(interval(DOB, SERUMDATE)) / as.duration(years(1)), 
                            digits = 0),
           AGESCL = round(as.duration(interval(DOB, SCLDATE)) / as.duration(years(1)), 
                          digits = 0),
           AGE1ATK = case_when(is.na(AGEGOUTDOX) ~ round(as.duration(interval(DOB, GOUTDOXDATE)) / as.duration(years(1)), 
                                                         digits = 0),
                               TRUE ~ AGEGOUTDOX),
           DURATION = AGECOL - AGE1ATK + 1,
           TOPHIGOUT = case_when(COMMENT %in% c("No information, neither tophaceous or aspirate proven, Deceased", 
                                                "No information, neither tophaceous or aspirate proven",
                                                "Gout, no tophi",
                                                "No information, neither tophaceous or aspirate proven, lymphoma") ~ FALSE, 
                                 TRUE ~ TOPHUS | GOUTCRITERIAB | SUSTOPHUS | COMMENT %in% c("Tophaceous",
                                                                                            "Urate crystals present, tophaceous", 
                                                                                            "Aspirate proven, tophacous", 
                                                                                            "allopurinol intolerant, febuxostat intolerant, taking benzobromarone.  Urate crystals present, tophacious", 
                                                                                            "Tophaceous gout", 
                                                                                            "Polyarticular tophaceous gout", 
                                                                                            "Chronic tophaceous gout")),
           EROSIONS = NA,
           NUMATK = NA,
           URATE1 = round(URATE * 1000 / 59.48, digits = 1),
           URATEAGE1 = AGESERUM,
           URATE2 = round(SURICACID_SCL * 1000 / 59.48, digits = 1),
           URATEAGE2 = AGESCL,
           URATE = case_when(!is.na(URATE1) ~ URATE1, 
                             TRUE ~ URATE2),
           ULT = case_when(is.na(URATE1) & !is.na(URATE2) ~ NA,
                           TRUE ~ ALLOP | PROBEN | COMMENT %in% c("allopurinol intolerant, febuxostat intolerant, taking benzobromarone.  Urate crystals present, tophacious",
                                                                  "Allopurinol hypersensitivity, Cholchicine induced diarrhoea, Febuxostat 40mg/day", 
                                                                  "febuxostat 40mg/day; liver toxicity with allopurinol")),
           PROPHY = STEROID | ANTIINFLAM | COLCHI,
           HYPERTENSION = case_when(!is.na(HIBP) ~ HIBP, 
                                    TRUE ~ DIURETICINDUCED == "Yes" | DIURETICSUMMARY == "Maybe taking diuretics"),
           TRIGLY = TRIGLY_SCL * 88.57,
           CHOLES = CHOLES_SCL * 38.67,
           STROKE = NA,
           HDL = HDL_SCL * 38.67,
           CREAT = CREAT / 88.42,
           SCREAT = SCREAT / 88.42,
           CREAT2 = rowMeans(across(c(CREAT, SCREAT)), na.rm = TRUE),
           EGFR = case_when(SEX == "Male" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60,
           TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
           CURSMOKE = NA,
           FAMGOUT = FAMGOUT | FAMGOUT3,
           FAMGOUTNUM = as.numeric(FAMGOUT4)) %>%
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  
  # Ardea - Ironwood (CLEAR1, CLEAR2, CRYSTAL, LIGHT)
  # CLEAR1 and CLEAR2 = People who are poor responders to allopurinol (everyone was on ULT at screening yet had > 6mg/dL)
  # CRYSTAL = Two groups, 1 = same as CLEAR trials but also had >= 1 tophus, 2 = very HU people not on ULT with at least one tophus
  # LIGHT = People who cannot take allopurinol, some may be on other ULT at screening
  logicfactor2 <- function(x) {
    as.logical(factor(x, levels = c(0, 1), labels = c("FALSE", "TRUE")))
  }
  
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2", "Ardea: CRYSTAL", "Ardea: LIGHT"))
  
  ironwood_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>% 
    filter(SUBJID %in% tmp$Pheno.SampleID) %>% 
    select(SUBJID, AGE, BRTHDTC, BLWEIGHT, BLHEIGHT, BLBMI, TRT01AN, CONSDT, TRTSDT, ANGINA:HYPERTRIGLY, MI, STROKE, AGFIDDT:GFDUR, CRITBFL, PHNM8FL, ULTALLO:ULTOTH, PLACTOTSTDT:PLACTOTENDT, THIALKFL:PROPHTYPN, TOPHIFN:BLAREA, GFNUM:GFNUMGR, DATESCREENING:EGFRSCREENING, CHOLSCREENING, TRIGSCREENING, URATESCREENING, DATENEG7, URATENEG7, EGFRNEG7, DATEBASELINE, URATEBASELINE, EGFRBASELINE, DATEMONTH1, URATEMONTH1, DATEMONTH2, URATEMONTH2, DATEMONTH3, URATEMONTH3, DATEMONTH4, URATEMONTH4, DATEMONTH5, URATEMONTH5, DATEMONTH6, URATEMONTH6, DATEMONTH8, URATEMONTH8, DATEMONTH10, URATEMONTH10, DATEMONTH12, URATEMONTH12, DATEEARLYTERM, URATEEARLYTERM, DATEFOLLOWUP, URATEFOLLOWUP, CURSMOKE:ALCOHOL, TOPHIGOUT:GOUTNOTES) %>% 
    left_join(tmp, by = c("SUBJID" = "Pheno.SampleID")) %>% 
    mutate(across(where(is_character), factor),
           across(c(ANGINA:STROKE, CRITBFL:ULTOTH, THIALKFL:PROPHYFL, TOPHIFN, CURSMOKE:ALCOHOL), 
                  logicfactor2),
           TRT01AN = factor(TRT01AN, 
                            levels = 0:5, 
                            labels = c("Screen Failure", "Group A (Placebo)", "Group B (Lesinurad 200 mg)", "Group C (Lesinurad 400 mg)", "Not Assigned", "Not Treated"))) %>%
    rename(IID = SUBJID,
           GOUT = Pheno.GoutSummary,
           AGECOL = AGE) %>%
    mutate(SEX = Geno.GeneticSex,
           AGE1ATK = round(as.duration(interval(ymd(BRTHDTC, truncated = 2L), AGFIDDT)) / as.duration(years(1)), 
                           digits = 0),
           DURATION = AGECOL - AGE1ATK + 1,
           TOPHIGOUT = TOPHIFN,
           EROSIONS = NA,
           NUMATK = GFNUM,
           URATE = URATESCREENING,
           ULT = ULTALLO | ULTPROB | ULTFEBU | ULTOTH | Pheno.Study %in% c("Ardea: CLEAR1", "Ardea: CLEAR2") | (Pheno.Study == "Ardea: CRYSTAL" & URATE < 8),
           PROPHY = PROPHYFL,
           BMI = BLBMI,
           HEART = HEARTFAILURE | MI | ANGINA,
           KIDNEY = EGFRSCREENING < 60,
           LIPIDS = HYPERCHOLESTEROL | HYPERTRIGLY,
           TOTALALC = NA,
           SUGDRINK = NA,
           FAMGOUT = NA,
           FAMGOUTNUM = NA,
           EGFR2 = case_when(SEX == "Male" ~ 175 * (SCRSCREENING ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (SCRSCREENING ^ -1.154) * (AGECOL ^ -0.203) * 0.742)) %>%
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  
  # Ardea - LASSO (all on ULT the whole time)
  lassopheno1 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoFlare.txt"), delim = "\t")
  lassopheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoLabChem.txt"), delim = "\t")
  lassopheno3 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t")
  
  tmp <- full_join(lassopheno3, lassopheno2, by = "SUBJID")
  
  lasso_pheno <- full_join(tmp, lassopheno1, by = "SUBJID")
  
  rm(lassopheno1, lassopheno2, lassopheno3)
  
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "Ardea: 401")
  
  lasso_pheno <- lasso_pheno %>% 
    filter(DNAID %in% tmp$Pheno.SampleID) %>% 
    mutate(IID = as.character(DNAID)) %>% 
    select(IID, AGE, BRTHDTC, GFNUM:ULTOSCR, BLBMI:PROPHTYP, AGFIDDT:GFDUR, TOLOCL:GFDTDURL, ANGINA:RENALIMPAIR, MI:STROKE, SCREENGFSTDT, SCREENGFENDT, SCREENGFOUT, SCREENGFSEV, SCREENPAIN, SCREENGFDUR:SCREENPAIN2, SCREENGFSTRSTP, SCREENLBDT, SCREENALT, SCREENCREAT, SCREENGGT, SCREENURATE, BASELINELBDT, BASELINEURATE, BASET1LBDT, BASET1URATE, BASET2LBDT, BASET2URATE, BASET3LBDT, BASET3URATE, MONTH1LBDT, MONTH1URATE, MONTH1T1LBDT, MONTH1T1URATE, MONTH1T2LBDT, MONTH1T2URATE, MONTH2LBDT, MONTH2URATE, MONTH2T1LBDT, MONTH2T1URATE, MONTH3LBDT, MONTH3URATE, MONTH3T1LBDT, MONTH3T1URATE, MONTH3T3LBDT, MONTH3T3URATE, MONTH4LBDT, MONTH4URATE, MONTH4T1LBDT, MONTH4T1URATE, MONTH5LBDT, MONTH5URATE, MONTH6LBDT, MONTH6URATE, UNSCHEDLBDT, UNSCHEDURATE, EARLYTERMLBDT, EARLYTERMURATE) %>% 
    left_join(tmp, by = c("IID" = "Pheno.SampleID")) %>% 
    rename(GOUT = Pheno.GoutSummary,
           AGECOL = AGE,
           NUMATK = GFNUM,
           SEX = Geno.GeneticSex,
           BMI = BLBMI) %>% 
    mutate(across(where(is_character), factor),
           across(c(TOHANDFL:ULTOSCR, BLCDFL, ANGINA:RENALIMPAIR, MI:STROKE), 
                  logicfactor2),
           AGE1ATK = round(as.duration(interval(ymd(BRTHDTC, truncated = 2L), AGFIDDT)) / as.duration(years(1)), 
                           digits = 0),
           DURATION = AGECOL - AGE1ATK + 1,
           TOPHIGOUT = BLTPHFN,
           EROSIONS = NA,
           URATE = SCREENURATE,
           ULT = ALLOSCR | ULTOSCR | SCREENURATE < 8,
           PROPHY = PROPHTYP %in% c("Both", "Colchicine", "NSAID"),
           HEART = ANGINA | MI,
           EGFR = case_when(SEX == "Male" ~ 175 * (SCREENCREAT ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (SCREENCREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = RENALIMPAIR | EGFR < 60,
           LIPIDS = HYPERCHOLESTEROL | HYPERTRIGLY,
           TOTALALC = NA,
           SUGDRINK = NA,
           CURSMOKE = NA,
           FAMGOUT = NA,
           FAMGOUTNUM = NA) %>%
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  # Ardea - Other (232 other Ardea study participants (594 and 3170)) - asked ruth if they have separate pheno files - she said we aren't allowed to use them - Tony suggested I just say Tony said why not use them
  # They don't have phenotypes of interest so no point including them
  # tmp <- CoreExPheno_Euro_Gout %>% 
  #   filter(str_detect(Pheno.Study, "Ardea: 3170") | str_detect(Pheno.Study, "Ardea: 594"))
  # 
  # other_ardea_euro_pheno <- read_delim(here("Data/Phenotypes/ArdeaPheno.txt"), delim = "\t") %>% 
  #   filter(SUBJID %in% tmp$Pheno.SampleID)
  # 
  # other_ardea_euro_pheno2 <- read_delim(here("Data/Phenotypes/ArdeaLassoPhenoMain.txt"), delim = "\t") %>% 
  #   filter(DNAID %in% tmp$Pheno.SampleID | SUBJID %in% tmp$Pheno.SampleID)
  # 
  # rm(tmp, other_ardea_euro_pheno, other_ardea_euro_pheno2)
  
  
  # DM
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "Diabetes Mellitus")
  
  dm_pheno <- read_delim(here("Data/Phenotypes/DMPheno.txt"), delim = "\t") %>% 
    filter(PATIENT %in% tmp$Pheno.SampleID) %>% 
    select(PATIENT, DOB, DATECOL, AGECOL, DIABETES:DIABETESTREAT, FAMGOUT:HIBPTREAT, LIPIDS, HEART:STROKE, KIDNEY:KIDNEY2, SUGDRINK, SMOKER:OTHALCO, WEIGHT, HEIGHT, BMI, URATE:CREAT, DIURETIC:OTHDIURETIC, LIPIDLOWER:BILEACIDSEQ, COMMENT, GOUTCRITERIAB, SUSTOPHUS:OTHDRUG, URATEDOX:DATEDOX, DIABETESAFFSTAT, KIDNEYTRANSPLANT, RENALDISEASE, FASTING:TRIGLY, SURICACID:EGFR) %>% 
    left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>% 
    rename(IID = PATIENT,
           GOUT = Pheno.GoutSummary,
           SEX = Geno.GeneticSex) %>% 
    mutate(across(where(is_character), factor),
           across(c(DIABETES, FAMGOUT, FAMGOUT3, HIBP, LIPIDS, HEART:STROKE, KIDNEY, DIURETIC:OTHDIURETIC, LIPIDLOWER:BILEACIDSEQ, GOUTCRITERIAB, SUSTOPHUS, TOPHUS, ALLOP:COLCHI, DIABETESAFFSTAT, KIDNEYTRANSPLANT, RENALDISEASE), logicfactor),
           DURATION = AGECOL - AGE1ATK + 1,
           TOPHIGOUT = TOPHUS | GOUTCRITERIAB | SUSTOPHUS,
           EROSIONS = NA,
           URATE = case_when(!is.na(SURICACID) ~ SURICACID * 1000 / 59.48,
                             !is.na(URATE) ~ URATE * 1000 / 59.48,
                             TRUE ~ URATEDOX * 1000 / 59.48),
           ULT = ALLOP | PROBEN, 
           PROPHY = STEROID | ANTIINFLAM | COLCHI | OTHDRUG != "no",
           HEIGHT = HEIGHT / 100,
           BMI = case_when(!is.na(BMI) ~ BMI, 
                           TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
           HYPERTENSION = HIBP | !is.na(HIBPTREAT) | DIURETIC | LOOPDIURETIC | THIAZIDEDIURETIC | OTHDIURETIC | DIURGOUT,
           DIABETES = DIABETES | !is.na(DIABETESTREAT) | DIABETESAFFSTAT,
           HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
           CREAT = CREAT / 88.42,
           SCREAT = SCREAT / 88.42,
           CREAT2 = rowMeans(across(c(CREAT, SCREAT)), na.rm = T),
           EGFR = case_when(SEX == "Male" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (CREAT2 ^ -1.154) * (AGECOL ^ -0.203) * 0.742,
                            TRUE ~ EGFR),
           KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60 | KIDNEYTRANSPLANT | RENALDISEASE,
           LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
           STROKE = STROKE,
           TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
           SUGDRINK = SUGDRINK,
           CURSMOKE = SMOKER == 2,
           FAMGOUT = FAMGOUT | FAMGOUT3,
           FAMGOUTNUM = FAMGOUT4) %>%
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  # EuroGout
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "EuroGout")
  
  eurogout_pheno <- read_delim(here("Data/Phenotypes/EuroGoutPheno.txt"), delim = "\t") %>% 
    filter(SUBJECT %in% tmp$Pheno.SampleID) %>% 
    select(SUBJECT, RECRUITMENTDATE, DOB:WEIGHT, HEIGHT, BMI, TOPHUS:GOUTNOTES, ACRB, ACRC8, RENALDISEASE, T2DIABETES:HEARTFAILURE, MEDICALCOMMENT, URATETHERAPY:ALLOPURINOL, CHOLCHICINE:TLDIURETICS, ASPRIN, SUGARDRINK:FRUITJUICE, ALCOHOL:PREUTLKURATE, TCHOLESTEROL:TRIGLYCERIDES, EGFR) %>% 
    left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>% 
    rename(IID = SUBJECT,
           GOUT = Pheno.GoutSummary,
           AGECOL = AGERECRUITMENT,
           SEX = Geno.GeneticSex) %>% 
    mutate(across(where(is_character), factor),
           across(c(TOPHUS, EROSIONS, ACRB, ACRC8, RENALDISEASE, T2DIABETES, HYPERTENSION, DYSLIPIDEMIA, STROKE:HEARTFAILURE, ALLOPURINOL:ASPRIN), 
                  logicfactor2),
           AGE1ATK = case_when(!is.na(AGEFIRSTATTK) ~ AGEFIRSTATTK,
                               TRUE ~ AGECOL - DURATIONGOUT),
           DURATION = AGECOL - AGE1ATK + 1,
           NUMATK = case_when(!is.na(NUMATTACKS) ~ NUMATTACKS, 
                              NUMATTACKS_TXT == ">5" ~ 5, 
                              NUMATTACKS_TXT == "1" ~ 1, 
                              NUMATTACKS_TXT == "2" ~ 2, 
                              NUMATTACKS_TXT == "3" ~ 3, 
                              NUMATTACKS_TXT %in% c("3 to 5", "3-5") ~ 4, 
                              NUMATTACKS_TXT %in% c("reported 'continue' I think. I assume this means ongoing.", "reported 100.") ~ 52, 
                              NUMATTACKS_TXT == "zehn" ~ 10),
           TOPHIGOUT = TOPHUS | NUMTOPHI %in% 1:3 | ACRB | ACRC8,
           URATE = case_when(is.na(SERUMURATE) ~ PREUTLKURATE * 1000 / 59.48,
                             TRUE ~ SERUMURATE * 1000 / 59.48),
           ULT = GOUTNOTES == "Gout assumed, taking allopurinol" | (!is.na(URATETHERAPY) & !(URATETHERAPY %in% c("diet", "NIL", "no", "No uric acid lowering therapy", "none", "None", "NONE", "none listed", "Unclear"))) | ALLOPURINOL, 
           PROPHY = CHOLCHICINE | NSAIDS | ASPRIN,
           HEIGHT = HEIGHT / 100,
           BMI = case_when(!is.na(BMI) ~ BMI, 
                           TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
           HYPERTENSION = HYPERTENSION | !is.na(HYPERTENTREATM) | MEDICALCOMMENT == "Said no to hypertension but beside BP states is on losartan" | DIURETICS | TLDIURETICS,
           DIABETES = T2DIABETES | !is.na(T2DTREATMENT),
           HEART = MI | IHD | HEARTFAILURE | MEDICALCOMMENT %in% c("Cardiovascular disease", "Heart problems", "Heart problems. EGFR available", "Heart problems. EGFR available. EGFR available. EGFR<60"),
           CREAT = SERUMCREATININE / 88.42,
           EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742,
                            TRUE ~ EGFR),
           KIDNEY = RENALDISEASE | EGFR < 60,
           LIPIDS = DYSLIPIDEMIA | !is.na(LIPIDTREATMENT),
           STROKE = STROKE,
           TOTALALC = ALCOHOL,
           SUGDRINK = SUGARDRINK + FRUITJUICE,
           CURSMOKE = SMOKER == 1,
           FAMGOUT = FAMILYHISTORY == 1,
           FAMGOUTNUM = NUMFAMILYGOUT) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  
  # Gout in Aotearoa
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "Gout in Aotearoa")
  
  aotearoa_pheno <- read_delim(here("Data/Phenotypes/NZPheno.txt"), delim = "\t") %>% 
    filter(SUBJECT %in% tmp$Pheno.SampleID) %>% 
    select(SUBJECT, DATEARR, DOB, AGECOL, DIABETES, FAMGOUT, FAMGOUT3:HIBP, HIBPTREAT:FRUSEMIDE, BUMETANIDE, THIAZIDEDIURETIC:BENDROFLUAZIDE, HCTZ, METOLAZONE, CHLORHALIDONE, INDAPAMIDE, OTHDIURETIC, SPIRONOLACTONE, AMILORIDE, ACETAZOLAMIDE, DIURETICCOMMENT:DIURRECRUIT, LIPIDS, LIPIDLOWER:BILEACIDSEQ, HEART:STROKE, KIDNEY:HEALTHOTH, SUGDRINK, SMOKER:OTHALCO, WEIGHT:HEIGHT, BMI:BMICALC, MRURATE:MRCREATDATE, GOUTCRITERIAB, SUSTOPHUS:DIURGOUT, ALLOPCURRENT, PROBENCURRENT, BENZBROCURRENT, FEBUXCURRENT, OTHULTCURRENT, CURULTCOMMENT:ALLOPINTOLERANCE, ALLOPSIDE, URATEDOX:HIGHESTSUDATE, CHOLES:TRIGLY, SCREAT:SURICACID, URATE1MONTH, RELATEDFILTER:RELATED) %>% 
    left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>% 
    rename(IID = SUBJECT,
           GOUT = Pheno.GoutSummary,
           SEX = Geno.GeneticSex) %>% 
    mutate(across(where(is_character), factor),
           across(c(FAMGOUT, FAMGOUT3, HIBP, DIURETIC:ACETAZOLAMIDE, LIPIDS:KIDNEY, FATTYLIVER, GOUTCRITERIAB:SUSTOPHUS, TOPHUS, ALLOPCURRENT:OTHULTCURRENT, ALLOPINTOLERANCE), 
                  logicfactor),
           AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK,
                               TRUE ~ AGECOL - DURATION),
           DURATION = AGECOL - AGE1ATK + 1,
           NUMATK = NUMATK,
           TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
           EROSIONS = NA,
           URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(MRURATE, URATEDOX, PREULTURATE, HIGHESTSU, URATE1MONTH)), na.rm = TRUE) * 1000 / 59.48,
                             TRUE ~ SURICACID * 1000 / 59.48),
           ULT = ALLOPCURRENT | PROBENCURRENT | BENZBROCURRENT | FEBUXCURRENT | OTHULTCURRENT, 
           PROPHY = NA,
           HEIGHT = HEIGHT / 100,
           BMI = case_when(!is.na(BMI) ~ BMI,
                           TRUE ~ WEIGHT / (HEIGHT * HEIGHT)),
           HYPERTENSION = HIBP | !is.na(HIBPTREAT) | DIURETIC | DIURETICCURRENT | LOOPDIURETIC | FRUSEMIDE | BUMETANIDE | THIAZIDEDIURETIC | BENDROFLUAZIDE | HCTZ | METOLAZONE | CHLORHALIDONE | INDAPAMIDE | OTHDIURETIC | SPIRONOLACTONE | AMILORIDE | ACETAZOLAMIDE | !is.na(DIURETICCOMMENT) | DIURRECRUIT == 2 | DIURGOUT %in% 2:4,
           DIABETES = DIABETES == 2,
           HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
           CREAT = rowMeans(across(c(SCREAT, MRCREAT))) / 88.42,
           EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = KIDNEY | !is.na(KIDNEY2) | EGFR < 60,
           LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
           STROKE = STROKE,
           TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
           SUGDRINK = SUGDRINK,
           CURSMOKE = SMOKER == 2,
           FAMGOUT = FAMGOUT,
           FAMGOUTNUM = FAMGOUT4) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  
  # LPA (don't have any phenotypes of interest for this study)
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "LPA")
  
  lpa_pheno <- read_delim(here("Data/Phenotypes/LPAPheno.txt"), delim = "\t") %>% 
    filter(SUBJECT %in% tmp$Pheno.SampleID) %>%
    select(SUBJECT:AGE, SMOKING, SMOKEHISTORY, SUGARDRINKS:DIABETESTYPE, MAINHYPERTENSION:DYSLIPIDCOMMENT, MAINSTROKE:MAINSTROKECOM, BMHEIGHT:BMWEIGHT, SERUMCREATININE:SERUMURATE, TOTALCHOLESTEROL, TRIGLYCERIDES) %>% 
    left_join(tmp, by = c("SUBJECT" = "Pheno.SampleID")) %>% 
    rename(IID = SUBJECT,
           GOUT = Pheno.GoutSummary,
           SEX = Geno.GeneticSex,
           AGECOL = AGE) %>% 
    mutate(across(where(is_character), factor),
           across(c(SMOKING:SMOKEHISTORY, MAINDIABETES, MAINHYPERTENSION, DYSLIPIDEMIA), 
                  logicfactor),
           AGE1ATK = NA,
           DURATION = NA,
           NUMATK = NA,
           TOPHIGOUT = NA,
           EROSIONS = NA,
           URATE = SERUMURATE,
           ULT = NA, 
           PROPHY = NA,
           HEIGHT = BMHEIGHT / 100,
           BMI = BMWEIGHT / (HEIGHT * HEIGHT),
           HYPERTENSION = MAINHYPERTENSION,
           DIABETES = DIABETESTYPE == 2,
           HEART = NA,
           CREAT = SERUMCREATININE / 88.42,
           EGFR = case_when(SEX == "Male" ~ 175 * (SERUMCREATININE ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (SERUMCREATININE ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = EGFR < 60,
           LIPIDS = DYSLIPIDEMIA,
           STROKE = MAINSTROKE,
           TOTALALC = NA,
           SUGDRINK = SUGARDRINKS,
           CURSMOKE = SMOKING,
           FAMGOUT = NA,
           FAMGOUTNUM = NA) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  # Ngati Porou
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "Ngati Porou")
  
  nph_pheno <- read_delim(here("Data/Phenotypes/NPHPheno.txt"), delim = "\t") %>% 
    filter(PATIENT %in% tmp$Pheno.SampleID) %>% 
    select(PATIENT, DOB, CONSENT, DATEARR, AGECOL, DIABETES, FAMGOUT:HIBP, LIPIDS, LIPIDLOWER:STROKE, KIDNEY, SUGDRINK, SMOKER:SPIRITS, WEIGHT:HEIGHT, BMI, URATE:CREATDATE, DIURETICCURRENT:FRUSEMIDE, BUMETANIDE, BENDROFLUAZIDE, HCTZ, METOLAZONE, CHLORHALIDONE, SPIRONOLACTONE, AMILORIDE, COMMENT, GOUTCRITERIAB, SUSTOPHUS, AGE1ATK:ALLOP, STEROID:OTHDRUG, URATEDOX:DATEDOX, RENALTRANSPLANT, DIABETESAFFSTAT, SURICACID:SCREAT, DIURETIC:OTHDIURETIC, STATIN:BILEACIDSEQ, URATELOWERING) %>% 
    left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>% 
    rename(IID = PATIENT,
           GOUT = Pheno.GoutSummary,
           SEX = Geno.GeneticSex) %>% 
    mutate(across(where(is_character), factor),
           across(c(DIABETES, FAMGOUT, FAMGOUT3, HIBP:STROKE, KIDNEY, DIURETICCURRENT:AMILORIDE, GOUTCRITERIAB, SUSTOPHUS, TOPHUS, ALLOP:BENZOBROMARONE, RENALTRANSPLANT, DIABETESAFFSTAT, DIURETIC:URATELOWERING), 
                  logicfactor),
           AGE1ATK = case_when(!is.na(AGE1ATK) ~ AGE1ATK,
                               TRUE ~ AGECOL - DURATION),
           DURATION = AGECOL - AGE1ATK + 1,
           NUMATK = NUMATK,
           TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
           EROSIONS = NA,
           URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(URATE, URATEDOX)), na.rm = TRUE) * 1000 / 59.48,
                             TRUE ~ SURICACID * 1000 / 59.48),
           ULT = ALLOP | PROBEN | BENZOBROMARONE | URATELOWERING, 
           PROPHY = STEROID | ANTIINFLAM | COLCHI,
           HEIGHT = HEIGHT / 100,
           BMI = WEIGHT / (HEIGHT * HEIGHT),
           HYPERTENSION = HIBP | DIURETICCURRENT | FRUSEMIDE | BUMETANIDE | BENDROFLUAZIDE | HCTZ | METOLAZONE | CHLORHALIDONE | SPIRONOLACTONE | AMILORIDE | DIURGOUT %in% 2:4 | DIURETIC | LOOPDIURETIC | THIAZIDEDIURETIC | OTHDIURETIC,
           DIABETES = DIABETES | DIABETESAFFSTAT,
           HEART = HEART | ANGINA | HEARTFAILURE | HEARTSURGERY | HEARTATTACK,
           CREAT = rowMeans(across(c(CREAT, SCREAT)), na.rm = TRUE) / 88.42,
           EGFR = case_when(SEX == "Male" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (CREAT ^ -1.154) * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = KIDNEY | EGFR < 60 | RENALTRANSPLANT,
           LIPIDS = LIPIDS | LIPIDLOWER | STATIN | FIBRATES | EZETIMIBE | NICOTINICACID | BILEACIDSEQ,
           STROKE = STROKE,
           TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
           SUGDRINK = SUGDRINK,
           CURSMOKE = SMOKER == 2,
           FAMGOUT = FAMGOUT,
           FAMGOUTNUM = FAMGOUT4) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  # RD
  tmp <- CoreExPheno_Final %>% 
    filter(Pheno.Study == "Renal Disease")
  
  rd_pheno <- read_delim(here("Data/Phenotypes/RDPheno.txt"), delim = "\t") %>% 
    filter(PATIENT %in% tmp$Pheno.SampleID) %>% 
    select(PATIENT, DOB, CONSENTDATE, DATECOL, DATEARR, CKDV, RENALTRANSPLANT, DIABETES, FAMGOUT, HYPERTENSION, DYSLIPIDAEMIA, IHD, CVA, CHF, HEALTHOTH:WEIGHT, BMI, SMOKER, SUGDRINK, BEER:SPIRITS, COMMENT, TYPE2D, GOUTCRITERIAB, SUSTOPHUS, AGE1ATK:OTHDRUG, ESSENTIALHYPERT, SURICACID:SCREAT, RCOMMENTS) %>% 
    left_join(tmp, by = c("PATIENT" = "Pheno.SampleID")) %>% 
    rename(IID = PATIENT,
           GOUT = Pheno.GoutSummary,
           SEX = Geno.GeneticSex) %>% 
    mutate(across(where(is_character), factor),
           across(c(RENALTRANSPLANT:CHF, TYPE2D:SUSTOPHUS, TOPHUS, ALLOPURINOL:RASBURICASE), 
                  logicfactor),
           AGECOL = AGECOL,
           AGE1ATK = AGE1ATK,
           DURATION = AGECOL - AGE1ATK + 1,
           NUMATK = NA,
           TOPHIGOUT = GOUTCRITERIAB | SUSTOPHUS | TOPHUS,
           EROSIONS = NA,
           URATE = case_when(is.na(SURICACID) ~ rowMeans(across(c(URATEFIRSTREC, URATEDOX, URATERECENT)), na.rm = TRUE) * 1000 / 59.48,
                             TRUE ~ SURICACID * 1000 / 59.48),
           ULT = ALLOPURINOL | PROBEN | RASBURICASE, 
           PROPHY = STEROID | ANTIINFLAM | COLCHI,
           HEIGHT = HEIGHT / 100,
           BMI = WEIGHT / (HEIGHT * HEIGHT),
           HYPERTENSION = HYPERTENSION | ESSENTIALHYPERT == 1 | DIURGOUT %in% 2:4,
           DIABETES = DIABETES | TYPE2D,
           HEART = IHD | CHF,
           EGFR = case_when(SEX == "Male" ~ 175 * (SCREAT / 88.42) ^ -1.154 * (AGECOL ^ -0.203), 
                            SEX == "Female" ~ 175 * (SCREAT / 88.42) ^ -1.154 * (AGECOL ^ -0.203) * 0.742),
           KIDNEY = CKDV == 1 | RENALTRANSPLANT | EGFR < 60,
           LIPIDS = DYSLIPIDAEMIA,
           STROKE = CVA,
           TOTALALC = rowSums(across(c(BEER, WINE, SPIRITS)), na.rm = TRUE),
           SUGDRINK = SUGDRINK,
           CURSMOKE = SMOKER == 2,
           FAMGOUT = FAMGOUT,
           FAMGOUTNUM = NA) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  
  # Combining all cohorts together
  all_pheno <- rbind(agria_pheno, aotearoa_pheno, dm_pheno, eurogout_pheno, ironwood_pheno, lasso_pheno, lpa_pheno, nph_pheno, rd_pheno) %>% 
    mutate(Pheno.Study = factor(Pheno.Study)) %>% 
    arrange(IID) %>% 
    filter(!(duplicated(IID) | duplicated(IID, fromLast = TRUE)))
  
  rm(aotearoa_pheno, agria_pheno, dm_pheno, nph_pheno, rd_pheno, eurogout_pheno, ironwood_pheno, lasso_pheno, tmp, lpa_pheno, CoreExPheno_Final)
  
  # UK Biobank
  load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/self_report_med.RData")
  tmp <- self_report_med %>% 
    mutate(IID = eid,
           ULT = allopurinol | sulphinpyrazone | probenecid) %>% 
    select(IID, ULT)
  
  load("/Volumes/archive/merrimanlab/raid_backup/UKbiobank/decrypted_files/ukb27189_27190_27191_27192_27193_27194_27195_27640_30070_31460_combined_withdrawn_ids_removed_10-07-2019.RData")
  
  test <- refresh_ukbb_data %>% 
    select(eid, body_mass_ind_bmi_f21001_0_0, alcohol_intake_frequency_f1558_0_0, current_tobacco_smoking_f1239_0_0)
  
  load("/Volumes/userdata/student_users/nicksumpter/Documents/PhD/Cluster/final_data_prs.RData")
  ukbb_pheno <- final_data_prs %>% 
    mutate(eid = as.numeric(eid)) %>% 
    left_join(test, by = "eid") %>% 
    rename(IID = eid,
           GOUT = gout,
           AGECOL = age,
           SEX = sex,
           URATE = urate) %>% 
    mutate(Geno.PCVector1 = NA,
           Geno.PCVector2 = NA,
           Geno.PCVector3 = NA,
           Geno.PCVector4 = NA,
           Geno.PCVector5 = NA,
           Geno.PCVector6 = NA,
           Geno.PCVector7 = NA,
           Geno.PCVector8 = NA,
           Geno.PCVector9 = NA,
           Geno.PCVector10 = NA,
           Geno.PCVector1_Oc = NA,
           Geno.PCVector2_Oc = NA,
           Geno.PCVector3_Oc = NA,
           Geno.PCVector4_Oc = NA,
           Geno.PCVector5_Oc = NA,
           Geno.PCVector6_Oc = NA,
           Geno.PCVector7_Oc = NA,
           Geno.PCVector8_Oc = NA,
           Geno.PCVector9_Oc = NA,
           Geno.PCVector10_Oc = NA,
           AGE1ATK = NA,
           DURATION = NA,
           TOPHIGOUT = NA, 
           EROSIONS = NA,
           NUMATK = NA,
           PROPHY = NA,
           Geno.SpecificAncestry = "European",
           BMI = body_mass_ind_bmi_f21001_0_0,
           HYPERTENSION = hypertension,
           DIABETES = type2_diabetes,
           HEART = coronary_heart_disease | heart_failure,
           KIDNEY = ckd_stage3 | ckd_stage4 | end_stage_renal,
           LIPIDS = dyslipidemia,
           STROKE = cerebrovascular_disease,
           TOTALALC = case_when(alcohol_intake_frequency_f1558_0_0 == "Daily or almost daily" ~ 14,
                                alcohol_intake_frequency_f1558_0_0 == "Three or four times a week" ~ 4,
                                alcohol_intake_frequency_f1558_0_0 == "Once or twice a week" ~ 2,
                                TRUE ~ NA_real_),
           SUGDRINK = NA,
           CURSMOKE = current_tobacco_smoking_f1239_0_0 == "Yes, on most or all days",
           FAMGOUT = NA,
           FAMGOUTNUM = NA,
           Geno.SampleID = NA,
           Pheno.Study = "UK Biobank") %>% 
    left_join(tmp) %>% 
    select(IID, GOUT, AGECOL, SEX, Geno.PCVector1:Geno.PCVector10, Geno.PCVector1_Oc:Geno.PCVector10_Oc, AGE1ATK, DURATION, TOPHIGOUT, EROSIONS, NUMATK, URATE, ULT, PROPHY, Geno.SpecificAncestry, BMI, HYPERTENSION, DIABETES, HEART, KIDNEY, LIPIDS, STROKE, TOTALALC, SUGDRINK, CURSMOKE, FAMGOUT, FAMGOUTNUM, Pheno.Study, Geno.SampleID)
  
  save(all_pheno, ukbb_pheno, file = here("Output/Phenotypes.RData"))
}

The imputed PRS was calculated for all European samples. Using the results of the UKBB GWAS, 28 PRS metrics were generated, this includes the complete PRS and the 27 variants individually.

# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/merged_PRS_UKBB.map"), 
                  delim = "\t", 
                  col_names = FALSE)
x <- read_delim(here("Output/Temp/merged_PRS_UKBB.ped"), 
                delim = " ", 
                col_names = FALSE, 
                col_types = cols(.default = col_character()))

colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")

x <- x %>% 
  filter(IID %in% all_pheno$Geno.SampleID)

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)

for(i in 1:nrow(UKBB_Gene_OR)) {
  x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
    str_replace("0", "NA") %>%
    str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
    str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
    as.numeric()
  x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
    str_replace("0", "NA") %>%
    str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
    str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
    as.numeric()
  x <- x %>%
    mutate("TEMP" = x[[2 * i + 5]] + x[[2 * i + 6]])
  colnames(x) <- c(colnames(x[1:(num_cols - 1 + i)]), UKBB_Gene_OR[[i, "RSID"]])
}

x <- x %>%
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x

# Now to calculate the PRS
for(i in 1:nrow(UKBB_Gene_OR)) {
  x[i + 1] <- x[[i + 1]] * log(UKBB_Gene_OR[[i, "OR"]])
}

x$PRS <- rowSums(x[2:ncol(x)])

x <- x %>%
  select(IID, PRS) %>% 
  left_join(x1)

all_pheno_prs <- all_pheno %>% 
    left_join(x, by = c("Geno.SampleID" = "IID"))



# UK Biobank
# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/merged_PRS.map"), 
                  delim = "\t", 
                  col_names = FALSE) %>% 
  separate(X2, into = c("X2", NA), sep = ",")

x <- read_delim(here("Output/Temp/merged_PRS.ped"), 
                delim = " ", 
                col_names = FALSE, 
                col_types = cols(.default = col_character()))

colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")

tmp <- c("FID", "IID", "PID", "MID", "SEX", "AFF", str_c(UKBB_Gene_OR$RSID, "_1"), str_c(UKBB_Gene_OR$RSID, "_2"))

x <- x %>% 
  select(which(colnames(x) %in% tmp)) %>% 
  mutate(IID = as.numeric(IID)) %>% 
  filter(IID %in% ukbb_pheno$IID)

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)

for(i in 1:nrow(UKBB_Gene_OR)) {
  x[[2 * i + 5]] <- x[[2 * i + 5]] %>%
    str_replace("0", "NA") %>%
    str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
    str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
    as.numeric()
  x[[2 * i + 6]] <- x[[2 * i + 6]] %>%
    str_replace("0", "NA") %>%
    str_replace(UKBB_Gene_OR[[i, "Effect_Allele"]], "1") %>%
    str_replace(UKBB_Gene_OR[[i, "Alternate_Allele"]], "0") %>%
    as.numeric()
  x <- x %>%
    mutate("TEMP" = x[[2 * i + 5]] + x[[2 * i + 6]])
  colnames(x) <- c(colnames(x[1:(num_cols - 1 + i)]), UKBB_Gene_OR[[i, "RSID"]])
}

x <- x %>%
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x

# Now to calculate the PRS
for(i in 1:nrow(UKBB_Gene_OR)) {
  x[i + 1] <- x[[i + 1]] * log(UKBB_Gene_OR[[i, "OR"]])
}

x$PRS <- rowSums(x[2:ncol(x)])

x <- x %>%
  select(IID, PRS) %>% 
  left_join(x1)

ukbb_pheno_prs <- ukbb_pheno %>% 
  left_join(x, by = c("IID")) %>% 
  mutate(IID = factor(IID))

all_pheno_prs <- all_pheno_prs %>% 
  mutate(GOUT = GOUT == "Gout") %>% 
  full_join(ukbb_pheno_prs)

rm(all_pheno, ukbb_pheno, ukbb_pheno_prs, map, x, x1, i, num_cols, tmp)

The directly genotyped PRS was then calculated for all European and Polynesian samples. Using the results of the UKBB GWAS (only using genotyped variants), 19 PRS metrics were generated, this includes the complete PRS and the 18 variants individually.

# Load in plink genotype files + rename column names + filter to only include IDs of cohort of interest
map <- read_delim(here("Output/Temp/Poly_SNPs.map"), 
                  delim = "\t", 
                  col_names = FALSE)

x <- read_delim(here("Output/Temp/Poly_SNPs.ped"), 
                delim = " ", 
                col_names = FALSE, 
                col_types = cols(.default = col_character()))

colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")

x <- x %>% filter(IID %in% (all_pheno_prs %>% filter(Pheno.Study != "UK Biobank") %>% pull(Geno.SampleID)))

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)
for (i in 1:nrow(Poly_Gene_OR)) {
  x[[2 * i + 5]] <- x[[2 * i + 5]] %>% 
    str_replace("0", "NA") %>% 
    str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>% 
    str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>% 
    as.numeric()
  x[[2 * i + 6]] <- x[[2 * i + 6]] %>% 
    str_replace("0", "NA") %>% 
    str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>% 
    str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>% 
    as.numeric()
  x <- x %>% 
    mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
  colnames(x) <- c(colnames(x[1:((num_cols - 1) + i)]), Poly_Gene_OR[[i, "RSID"]])
}
x <- x %>% 
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x

# Now to calculate the PRS
for(i in 1:nrow(Poly_Gene_OR)) {
  x[i + 1] <- x[[i + 1]] * log(Poly_Gene_OR[[i, "OR"]])
}

x$PRS2 <- rowSums(x[2:(ncol(x))])

x <- x %>% 
  select(IID, PRS2) %>% 
  left_join(x1)

all_pheno_prs_direct1 <- all_pheno_prs %>% 
  filter(Pheno.Study != "UK Biobank") %>% 
  rename(PRS1 = PRS) %>%
  full_join(x, by = c("Geno.SampleID" = "IID")) %>% 
  filter(!is.na(PRS1)) %>% 
  select(-ends_with(".y")) %>% 
  rename_with(~ str_remove(.x, ".x"), ends_with(".x"))

all_pheno_prs_direct2 <- all_pheno_prs %>% 
  filter(Pheno.Study != "UK Biobank") %>% 
  rename(PRS1 = PRS) %>%
  full_join(x, by = c("Geno.SampleID" = "IID")) %>% 
  filter(is.na(PRS1)) %>% 
  select(-ends_with(".x")) %>% 
  rename_with(~ str_remove(.x, ".y"), ends_with(".y"))



# UK Biobank
map <- read_delim(here("Output/Temp/merged_PRS.map"), 
                  delim = "\t", 
                  col_names = FALSE) %>% 
  separate(X2, into = c("X2", NA), sep = ",")

x <- read_delim(here("Output/Temp/merged_PRS.ped"), 
                delim = " ", 
                col_names = FALSE, 
                col_types = cols(.default = col_character()))

colnames(x)[1:6] <- c("FID", "IID", "PID", "MID", "SEX", "AFF")
colnames(x)[seq(from = 7, to = ncol(x) - 1, by = 2)] <- str_c(map$X2, "_1")
colnames(x)[seq(from = 8, to = ncol(x), by = 2)] <- str_c(map$X2, "_2")

tmp <- c("FID", "IID", "PID", "MID", "SEX", "AFF", str_c(Poly_Gene_OR$RSID, "_1"), str_c(Poly_Gene_OR$RSID, "_2"))

ids <- all_pheno_prs %>% filter(Pheno.Study == "UK Biobank") %>% pull(IID)

x <- x %>% 
  select(which(colnames(x) %in% tmp)) %>% 
  mutate(IID = as.numeric(IID)) %>% 
  filter(IID %in% ids)

# Convert character genotypes into numeric genotypes based on risk allele = 1
num_cols <- ncol(x)

for(i in 1:nrow(Poly_Gene_OR)) {
  x[[2 * i + 5]] <- x[[2 * i + 5]] %>% 
      str_replace("0", "NA") %>% 
    str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>% 
    str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>% 
    as.numeric()
  x[[2 * i + 6]] <- x[[2 * i + 6]] %>% 
      str_replace("0", "NA") %>% 
    str_replace(Poly_Gene_OR[[i, "Effect_Allele"]], "1") %>% 
    str_replace(Poly_Gene_OR[[i, "Alternate_Allele"]], "0") %>% 
    as.numeric()
  x <- x %>% 
    mutate("TEMP" = (x[[2 * i + 5]] + x[[2 * i + 6]]))
  colnames(x) <- c(colnames(x[1:(num_cols - 1 + i)]), Poly_Gene_OR[[i, "RSID"]])
}

x <- x %>% 
  select(2, (num_cols + 1):ncol(x))

# Save this dataframe for individual SNP analysis
x1 <- x

# Now to calculate the PRS
for(i in 1:nrow(Poly_Gene_OR)) {
  x[i + 1] <- x[[i + 1]] * log(Poly_Gene_OR[[i, "OR"]])
}

x$PRS2 <- rowSums(x[2:ncol(x)])

x <- x %>% 
  select(IID, PRS2) %>% 
  left_join(x1)

x <- x %>% 
  mutate(IID = factor(IID))

all_pheno_prs_direct3 <- all_pheno_prs %>% 
  filter(Pheno.Study == "UK Biobank") %>% 
  rename(PRS1 = PRS) %>%
  full_join(x, by = "IID") %>% 
  select(-ends_with(".y")) %>% 
  rename_with(~ str_remove(.x, ".x"), ends_with(".x"))

all_pheno_prs <- all_pheno_prs_direct1 %>% 
  full_join(all_pheno_prs_direct2) %>% 
  full_join(all_pheno_prs_direct3)

save(all_pheno_prs, file = here("Output/all_pheno_prs.RData"))

rm(all_pheno_prs_direct1, all_pheno_prs_direct2, map, x, x1, i, ids, num_cols, tmp)


Data Exploration

The purpose of this document is to explore the data that was generated in the PRS_Dataprep.Rmd file.

After loading in the data, I first produced a table of missing data for all of my variables of interest. I used this as evidence for filtering individuals. After filtering the data, and removing certain phenotypes that had too much missing data, I then re-observed the missingness and reported the proportion for each variable.

Next, I produced a table that shows the distribution of all variables of interest in all cohorts. I then plotted each of these distributions in all cohorts. Next, I plotted the relationship between several combinations of variables. Finally, I plotted the minor allele frequency of all SNPs in each cohort.

There is also code for observing the relationship of serum urate measurements over time, though this is not for this project.

load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Poly_Gene_OR.RData"))

load(here("Output/all_pheno_prs.RData"))

# Making FLARE_CAT variable and setting all control gout severity traits to NA and removing any non-Europeans with an imputed PRS
all_pheno_prs <- all_pheno_prs %>%
  mutate(FLARE_CAT = factor(case_when(between(NUMATK, 0, 5) ~ paste0(as.character(NUMATK), " flares in last year"),
                                      between(NUMATK, 6, 11) ~ "One every one to two months", 
                                      between(NUMATK, 12, 52) ~ "One or more per month"),
                            levels = c(paste0(0:5, " flares in last year"),
                                       "One every one to two months",
                                       "One or more per month"),
                            labels = c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52"),
                            ordered = TRUE),
         AGE1ATK = case_when(GOUT ~ AGE1ATK),
         DURATION = case_when(GOUT ~ DURATION),
         NUMATK = case_when(GOUT ~ NUMATK),
         TOPHIGOUT = case_when(GOUT ~ TOPHIGOUT),
         ULT = case_when(GOUT ~ ULT)) %>% 
  filter(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") | !(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")) & is.na(PRS1))

all_pheno_prs_male <- all_pheno_prs %>% 
  filter(SEX == "Male")

all_pheno_prs_female <- all_pheno_prs %>% 
  filter(SEX == "Female")

cohortstring <- c("UK Biobank - Gout - Male",
                  "UK Biobank - Gout - Female",
                  "UK Biobank - Control - Male",
                  "UK Biobank - Control - Female",
                  "Aus/NZ European - Gout - Male",
                  "Aus/NZ European - Gout - Female",
                  "Aus/NZ European - Control - Male",
                  "Aus/NZ European - Control - Female",
                  "GlobalGout - Gout - Male",
                  "GlobalGout - Gout - Female",
                  "GlobalGout - Control - Male",
                  "GlobalGout - Control - Female",
                  "Ardea - LASSO - Male",
                  "Ardea - LASSO - Female",
                  "Ardea - CLEAR1 - Male",
                  "Ardea - CLEAR1 - Female",
                  "Ardea - CLEAR2 - Male",
                  "Ardea - CLEAR2 - Female",
                  "Ardea - CRYSTAL - Male",
                  "Ardea - CRYSTAL - Female",
                  "Ardea - LIGHT - Male",
                  "Ardea - LIGHT - Female",
                  "East Polynesian - Gout - Male",
                  "East Polynesian - Gout - Female",
                  "East Polynesian - Control - Male",
                  "East Polynesian - Control - Female",
                  "West Polynesian - Gout - Male",
                  "West Polynesian - Gout - Female",
                  "West Polynesian - Control - Male",
                  "West Polynesian - Control - Female")

data_list <- list(all_pheno_prs_male %>% filter(GOUT,
                                                Pheno.Study == "UK Biobank"),
                  all_pheno_prs_female %>% filter(GOUT,
                                                  Pheno.Study == "UK Biobank"),
                  all_pheno_prs_male %>% filter(!GOUT,
                                                Pheno.Study == "UK Biobank"),
                  all_pheno_prs_female %>% filter(!GOUT,
                                                  Pheno.Study == "UK Biobank"),
                  all_pheno_prs_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs_male %>% filter(GOUT,
                                                Pheno.Study == "EuroGout",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(GOUT,
                                                  Pheno.Study == "EuroGout",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(!GOUT,
                                                Pheno.Study == "EuroGout",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(!GOUT,
                                                  Pheno.Study == "EuroGout",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: 401",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: 401",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")))

clean_names <- c("UK&nbsp;Biobank<br/>Gout<br/>Male",
                 "UK&nbsp;Biobank<br/>Gout<br/>Female",
                 "UK&nbsp;Biobank<br/>Control<br/>Male",
                 "UK&nbsp;Biobank<br/>Control<br/>Female",
                 "Aus/NZ&nbsp;European<br/>Gout<br/>Male",
                 "Aus/NZ&nbsp;European<br/>Gout<br/>Female",
                 "Aus/NZ&nbsp;European<br/>Control<br/>Male",
                 "Aus/NZ&nbsp;European<br/>Control<br/>Female",
                 "GlobalGout<br/>Gout<br/>Male",
                 "GlobalGout<br/>Gout<br/>Female",
                 "GlobalGout<br/>Control<br/>Male",
                 "GlobalGout<br/>Control<br/>Female",
                 "Ardea<br/>LASSO<br/>Gout<br/>Male",
                 "Ardea<br/>LASSO<br/>Gout<br/>Female",
                 "Ardea<br/>CLEAR1<br/>Gout<br/>Male",
                 "Ardea<br/>CLEAR1<br/>Gout<br/>Female",
                 "Ardea<br/>CLEAR2<br/>Gout<br/>Male",
                 "Ardea<br/>CLEAR2<br/>Gout<br/>Female",
                 "Ardea<br/>CRYSTAL<br/>Gout<br/>Male",
                 "Ardea<br/>CRYSTAL<br/>Gout<br/>Female",
                 "Ardea<br/>LIGHT<br/>Gout<br/>Male",
                 "Ardea<br/>LIGHT<br/>Gout<br/>Female",
                 "East&nbsp;Polynesian<br/>Gout<br/>Male",
                 "East&nbsp;Polynesian<br/>Gout<br/>Female",
                 "East&nbsp;Polynesian<br/>Control<br/>Male",
                 "East&nbsp;Polynesian<br/>Control<br/>Female",
                 "West&nbsp;Polynesian<br/>Gout<br/>Male",
                 "West&nbsp;Polynesian<br/>Gout<br/>Female",
                 "West&nbsp;Polynesian<br/>Control<br/>Male",
                 "West&nbsp;Polynesian<br/>Control<br/>Female")
# Functions 
report <- function(x) {
    if(sum(is.na(x)) != length(x)) {
      paste0(sprintf(mean(x, na.rm = TRUE), fmt = "%#.1f"), " ± ", sprintf(sd(x, na.rm = TRUE), fmt = "%#.1f"))
    } else {
      paste0("NA")
    }
}

report_median <- function(x) {
    if(sum(is.na(x)) != length(x)) {
      paste0(median(x, na.rm =T), " (", summary(x)[[2]], " - ", summary(x)[[5]], ")")
    } else {
      paste0("NA")
    }
}

sumreport <- function(x) {
  if(sum(is.na(x)) != length(x)){
    paste0(sum(x, na.rm = TRUE), " (", sprintf((mean(x, na.rm = TRUE) * 100), fmt = "%#.1f"), ")")
  } else {
      paste0("NA")
  }
}

transpose_df <- function(df) {
  t_df <- data.table::transpose(df)
  colnames(t_df) <- rownames(df)
  rownames(t_df) <- colnames(df)
  t_df <- t_df %>%
    rownames_to_column() %>%
    as_tibble() %>% 
    row_to_names(row_number = 1)
  return(t_df)
}

missing <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return("All")
    } else if(sum(!is.na(x)) == length(x)){
      return("None")
      } else {
      paste0(format(sum(is.na(x)), big.mark = ","), " (", format(round((sum(is.na(x)) / length(x) * 100), digits = 1), nsmall = 1), ")")
  }
}


Table of Missing Data Percentages

table1 <- tibble("Cohort" = cohortstring,
                 "N" = unlist(lapply(data_list, function(x) nrow(x))),
                 "Age at Collection" = unlist(lapply(data_list, function(x) missing(x$AGECOL))),
                 "Serum Urate" = unlist(lapply(data_list, function(x) missing(x$URATE))),
                 "ULT" = unlist(lapply(data_list, function(x) missing(x$ULT))),
                 "Age at Onset" = unlist(lapply(data_list, function(x) missing(x$AGE1ATK))),
                 "Disease Duration" = unlist(lapply(data_list, function(x) missing(x$DURATION))),
                 "Flares" = unlist(lapply(data_list, function(x) missing(x$NUMATK))),
                 "Tophi" = unlist(lapply(data_list, function(x) missing(x$TOPHIGOUT))),
                 "PRS - Imputed" = unlist(lapply(data_list, function(x) missing(x$PRS1))),
                 "PRS - Direct" = unlist(lapply(data_list, function(x) missing(x$PRS2))),
                 "Prophylaxis" = unlist(lapply(data_list, function(x) missing(x$PROPHY))),
                 "BMI" = unlist(lapply(data_list, function(x) missing(x$BMI))),
                 "Hypertension" = unlist(lapply(data_list, function(x) missing(x$HYPERTENSION))),
                 "Type 2 Diabetes" = unlist(lapply(data_list, function(x) missing(x$DIABETES))),
                 "Heart Disease" = unlist(lapply(data_list, function(x) missing(x$HEART))),
                 "Kidney Disease" = unlist(lapply(data_list, function(x) missing(x$KIDNEY))),
                 "Dyslipidemia" = unlist(lapply(data_list, function(x) missing(x$LIPIDS))),
                 "Stroke" = unlist(lapply(data_list, function(x) missing(x$STROKE))),
                 "Alcoholic Drinks / Week" = unlist(lapply(data_list, function(x) missing(x$TOTALALC))),
                 "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list, function(x) missing(x$SUGDRINK))),
                 "Current Smoker" = unlist(lapply(data_list, function(x) missing(x$CURSMOKE))),
                 "Family History of Gout" = unlist(lapply(data_list, function(x) missing(x$FAMGOUT))),
                 "No. Relatives w/ Gout" = unlist(lapply(data_list, function(x) missing(x$FAMGOUTNUM))))

table1 <- transpose_df(table1) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(table1), ~ str_replace_all(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(table1) <- str_replace_all(row.names(table1), " ", "&nbsp;")

table1 %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px") %>% 
  footnote("'All' = all missing, 'None' = none missing")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
N 8394 995 178004 214885 986 211 717 595 1555 211 45 76 783 65 224 16 233 8 170 4 104 9 582 172 308 466 480 68 223 215
Age at Collection None None None None None None None None 108 (6.9) 14 (6.6) None None 5 (0.6) None None None None None None None None None None None None None None None None 1 (0.5)
Serum Urate 393 (4.7) 59 (5.9) 8,374 (4.7) 10,379 (4.8) 34 (3.4) 9 (4.3) 25 (3.5) 14 (2.4) 264 (17.0) 47 (22.3) 30 (66.7) 69 (90.8) 4 (0.5) 1 (1.5) 1 (0.4) None 1 (0.4) None None None None None 5 (0.9) 2 (1.2) 36 (11.7) 72 (15.5) 2 (0.4) 2 (2.9) 23 (10.3) 18 (8.4)
ULT None None All All 433 (43.9) 105 (49.8) All All 654 (42.1) 102 (48.3) All All 5 (0.6) 1 (1.5) None None None None 77 (45.3) 3 (75.0) None None 163 (28.0) 44 (25.6) All All 136 (28.3) 20 (29.4) All All
Age at Onset All All All All 100 (10.1) 39 (18.5) All All 556 (35.8) 91 (43.1) All All 5 (0.6) None None None None None None None None None 45 (7.7) 20 (11.6) All All 33 (6.9) 17 (25.0) All All
Disease Duration All All All All 100 (10.1) 39 (18.5) All All 556 (35.8) 91 (43.1) All All 5 (0.6) None None None None None None None None None 45 (7.7) 20 (11.6) All All 33 (6.9) 17 (25.0) All All
Flares All All All All 172 (17.4) 50 (23.7) All All 604 (38.8) 91 (43.1) All All 5 (0.6) None None None None None None None None None 53 (9.1) 28 (16.3) All All 36 (7.5) 16 (23.5) All All
Tophi All All All All 255 (25.9) 56 (26.5) All All 1,011 (65.0) 137 (64.9) All All 5 (0.6) None 1 (0.4) None None None None None None None 130 (22.3) 48 (27.9) All All 55 (11.5) 13 (19.1) All All
PRS - Imputed 1,203 (14.3) 135 (13.6) 26,290 (14.8) 31,469 (14.6) None None None None None None None None None None None None None None None None None None All All All All All All All All
PRS - Direct 448 (5.3) 45 (4.5) 9,935 (5.6) 11,720 (5.5) 7 (0.7) None 3 (0.4) 2 (0.3) 120 (7.7) 15 (7.1) None None 2 (0.3) None 1 (0.4) None None None None None 2 (1.9) None 4 (0.7) 1 (0.6) 2 (0.6) 6 (1.3) 7 (1.5) None 5 (2.2) 2 (0.9)
Prophylaxis All All All All 920 (93.3) 194 (91.9) 716 (99.9) All 731 (47.0) 116 (55.0) All All None None None None None None None None None None 377 (64.8) 126 (73.3) 306 (99.4) All 419 (87.3) 56 (82.4) All 214 (99.5)
BMI 31 (0.4) 8 (0.8) 622 (0.3) 641 (0.3) 76 (7.7) 19 (9.0) 172 (24.0) 46 (7.7) 456 (29.3) 77 (36.5) All All 9 (1.1) None 1 (0.4) None None None None None None None 12 (2.1) 6 (3.5) 7 (2.3) 21 (4.5) 14 (2.9) 6 (8.8) 3 (1.3) 10 (4.7)
Hypertension None None None None 378 (38.3) 37 (17.5) 454 (63.3) 283 (47.6) 799 (51.4) 109 (51.7) All All None None 1 (0.4) None None None None None None None 167 (28.7) 23 (13.4) 220 (71.4) 309 (66.3) 209 (43.5) 16 (23.5) 176 (78.9) 161 (74.9)
Type 2 Diabetes 847 (10.1) 100 (10.1) 19,430 (10.9) 27,579 (12.8) 101 (10.2) 24 (11.4) 305 (42.5) 279 (46.9) 596 (38.3) 104 (49.3) All All None None None None None None None None None None 107 (18.4) 24 (14.0) 49 (15.9) 65 (13.9) 18 (3.8) 2 (2.9) 7 (3.1) 11 (5.1)
Heart Disease None None None None 224 (22.7) 35 (16.6) 366 (51.0) 303 (50.9) 1,041 (66.9) 133 (63.0) All All None None 1 (0.4) None None None None None None None 99 (17.0) 18 (10.5) 62 (20.1) 97 (20.8) 39 (8.1) 5 (7.4) 52 (23.3) 18 (8.4)
Kidney Disease 379 (4.5) 48 (4.8) 8,161 (4.6) 10,107 (4.7) 234 (23.7) 44 (20.9) 345 (48.1) 226 (38.0) 1,029 (66.2) 125 (59.2) All All 9 (1.1) None 2 (0.9) None None None None None None None 194 (33.3) 49 (28.5) 250 (81.2) 379 (81.3) 83 (17.3) 12 (17.6) 181 (81.2) 166 (77.2)
Dyslipidemia None None None None 381 (38.6) 64 (30.3) 410 (57.2) 184 (30.9) 666 (42.8) 112 (53.1) All All None None None None None None 1 (0.6) None None None 213 (36.6) 51 (29.7) 209 (67.9) 334 (71.7) 161 (33.5) 18 (26.5) 158 (70.9) 135 (62.8)
Stroke None None None None 310 (31.4) 63 (29.9) 256 (35.7) 85 (14.3) 1,032 (66.4) 157 (74.4) All All None None None None None None None None None None 141 (24.2) 30 (17.4) 70 (22.7) 90 (19.3) 53 (11.0) 7 (10.3) 49 (22.0) 18 (8.4)
Alcoholic Drinks / Week 1,173 (14.0) 472 (47.4) 36,713 (20.6) 75,187 (35.0) 8 (0.8) 1 (0.5) 120 (16.7) 219 (36.8) 1,026 (66.0) 141 (66.8) All All All All All All All All All All All All 42 (7.2) 3 (1.7) 1 (0.3) 6 (1.3) 45 (9.4) 3 (4.4) 4 (1.8) 2 (0.9)
Sugar-Sweetened Drinks / Week All All All All 121 (12.3) 27 (12.8) 171 (23.8) 48 (8.1) 1,295 (83.3) 180 (85.3) All All All All All All All All All All All All 87 (14.9) 17 (9.9) 6 (1.9) 12 (2.6) 102 (21.2) 7 (10.3) 5 (2.2) 3 (1.4)
Current Smoker None None None None 459 (46.6) 120 (56.9) 245 (34.2) 127 (21.3) 977 (62.8) 126 (59.7) All All All All 2 (0.9) None None None None None 3 (2.9) None 289 (49.7) 83 (48.3) 138 (44.8) 235 (50.4) 332 (69.2) 26 (38.2) 99 (44.4) 77 (35.8)
Family History of Gout All All All All 101 (10.2) 23 (10.9) 397 (55.4) 304 (51.1) 652 (41.9) 92 (43.6) 33 (73.3) 69 (90.8) All All All All All All All All All All 82 (14.1) 19 (11.0) 49 (15.9) 68 (14.6) 67 (14.0) 5 (7.4) 32 (14.3) 31 (14.4)
No. Relatives w/ Gout All All All All 323 (32.8) 78 (37.0) 509 (71.0) 457 (76.8) 1,170 (75.2) 168 (79.6) All All All All All All All All All All All All 166 (28.5) 44 (25.6) 90 (29.2) 107 (23.0) 103 (21.5) 21 (30.9) 52 (23.3) 61 (28.4)
Note: ‘All’ = all missing, ‘None’ = none missing
#datatable(table1, extension = "Responsive")

From this table we can tell the following:

  1. Age at collection is only missing in GlobalGout, LASSO and West Polynesian Controls, and thus anybody missing this variable should be removed

  2. Serum urate has less than 5% missingness in all European cohorts except for GlobalGout (with 22%), Polynesian controls also have approximately 10% missingness

  3. ULT data seems to be missing for around 50% of people in Gout in Aotearoa, GlobalGout, and the CRYSTAL trial but is present for all other cohorts

  4. Age at onset (and disease duration) are missing at around 10% or less for all cohorts except GlobalGout, which has over 40% missingness for this variable

  5. Flare data is missing at up to 20% for most cohorts, but around 43% missing in GlobalGout

  6. Tophi data is not missing in Ardea, around 10 - 25% missing in Gout in Aotearoa, and 67% missing in GlobalGout

  7. The imputed PRS is well phenotyped in all cohorts except for UK Biobank (with 15% missingness)

  8. The directly genotyped PRS is missing in up to 7% of each cohort

  9. Prophylaxis data is not well phenotyped in Gout in Aotearoa but the Ardea cohort and potentially the GlobalGout cohort could incorporate this data into flare models

  10. Comorbidities are inconsistently phenotyped, but they should still be useful for describing the cohorts

  11. Lifestyle factors are not well phenotyped in anything but the Gout in Aotearoa cohort so might not be useful

  12. Family history of gout data is not phenotyped in any Ardea cohorts, and is inconsistently phenotyped in Gout in Aotearoa and GlobalGout

Therefore, I will take the following actions:

  1. I will remove anyone missing age at collection

  2. I will remove anyone missing all three severity traits (excluding controls and UK Biobank)

  3. I will remove any Europeans missing the Imputed PRS

  4. I will remove anyone missing the directly genotyped PRS

  5. For any models involving serum urate, I should exclude those missing ULT, then stratify based on ULT usage

  6. For any models involving flares, I should do additional analyses to stratify based on ULT and/or prophylaxis. Also alcoholic drinks per week should be tested for association with flares

  7. For tophus models, I should do additional analyses adjusting for disease duration

  8. Family history of gout could be tested for association with disease severity (also PRS)


all_pheno_prs2 <- all_pheno_prs %>% 
  filter(!is.na(AGECOL),
         !is.na(PRS2),
         !(is.na(PRS1) & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
         (Pheno.Study == "UK Biobank" | !GOUT | GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))))

all_pheno_prs2_male <- all_pheno_prs2 %>% 
  filter(SEX == "Male")

all_pheno_prs2_female <- all_pheno_prs2 %>% 
  filter(SEX == "Female")

data_list2 <- list(all_pheno_prs2_male %>% filter(GOUT,
                                                Pheno.Study == "UK Biobank"),
                  all_pheno_prs2_female %>% filter(GOUT,
                                                  Pheno.Study == "UK Biobank"),
                  all_pheno_prs2_male %>% filter(!GOUT,
                                                Pheno.Study == "UK Biobank"),
                  all_pheno_prs2_female %>% filter(!GOUT,
                                                  Pheno.Study == "UK Biobank"),
                  all_pheno_prs2_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs2_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs2_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs2_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  all_pheno_prs2_male %>% filter(GOUT,
                                                Pheno.Study == "EuroGout",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(GOUT,
                                                  Pheno.Study == "EuroGout",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(!GOUT,
                                                Pheno.Study == "EuroGout",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(!GOUT,
                                                  Pheno.Study == "EuroGout",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(Pheno.Study == "Ardea: 401",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(Pheno.Study == "Ardea: 401",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_female %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  all_pheno_prs2_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs2_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs2_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs2_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  all_pheno_prs2_male %>% filter(GOUT,
                                                Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs2_female %>% filter(GOUT,
                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs2_male %>% filter(!GOUT,
                                                Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  all_pheno_prs2_female %>% filter(!GOUT,
                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")))

table1 <- tibble("Cohort" = cohortstring,
                 "N" = unlist(lapply(data_list2, function(x) nrow(x))),
                 "Age at Collection" = unlist(lapply(data_list2, function(x) missing(x$AGECOL))),
                 "Serum Urate" = unlist(lapply(data_list2, function(x) missing(x$URATE))),
                 "ULT" = unlist(lapply(data_list2, function(x) missing(x$ULT))),
                 "Age at Onset" = unlist(lapply(data_list2, function(x) missing(x$AGE1ATK))),
                 "Disease Duration" = unlist(lapply(data_list2, function(x) missing(x$DURATION))),
                 "Flares" = unlist(lapply(data_list2, function(x) missing(x$NUMATK))),
                 "Tophi" = unlist(lapply(data_list2, function(x) missing(x$TOPHIGOUT))),
                 "PRS - Imputed" = unlist(lapply(data_list2, function(x) missing(x$PRS1))),
                 "PRS - Direct" = unlist(lapply(data_list2, function(x) missing(x$PRS2))),
                 "Prophylaxis" = unlist(lapply(data_list2, function(x) missing(x$PROPHY))),
                 "BMI" = unlist(lapply(data_list2, function(x) missing(x$BMI))),
                 "Hypertension" = unlist(lapply(data_list2, function(x) missing(x$HYPERTENSION))),
                 "Type 2 Diabetes" = unlist(lapply(data_list2, function(x) missing(x$DIABETES))),
                 "Heart Disease" = unlist(lapply(data_list2, function(x) missing(x$HEART))),
                 "Kidney Disease" = unlist(lapply(data_list2, function(x) missing(x$KIDNEY))),
                 "Dyslipidemia" = unlist(lapply(data_list2, function(x) missing(x$LIPIDS))),
                 "Stroke" = unlist(lapply(data_list2, function(x) missing(x$STROKE))),
                 "Alcoholic Drinks / Week" = unlist(lapply(data_list2, function(x) missing(x$TOTALALC))),
                 "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list2, function(x) missing(x$SUGDRINK))),
                 "Current Smoker" = unlist(lapply(data_list2, function(x) missing(x$CURSMOKE))),
                 "Family History of Gout" = unlist(lapply(data_list2, function(x) missing(x$FAMGOUT))),
                 "No. Relatives w/ Gout" = unlist(lapply(data_list2, function(x) missing(x$FAMGOUTNUM))))

table1 <- transpose_df(table1) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(table1), ~ str_replace(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(table1) <- str_replace(row.names(table1), " ", "&nbsp;")

table1 %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px") %>% 
  footnote("'All' = all missing, 'None' = none missing")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
N 7094 851 149748 181186 928 195 714 593 1017 125 45 76 776 65 223 16 233 8 170 4 102 9 563 161 306 460 458 61 218 212
Age at Collection None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
Serum Urate 337 (4.8) 54 (6.3) 7,064 (4.7) 8,684 (4.8) 19 (2.0) 5 (2.6) 25 (3.5) 14 (2.4) 49 (4.8) 8 (6.4) 30 (66.7) 69 (90.8) 4 (0.5) 1 (1.5) 1 (0.4) None 1 (0.4) None None None None None 2 (0.4) 2 (1.2) 36 (11.8) 71 (15.4) 1 (0.2) 1 (1.6) 23 (10.6) 17 (8.0)
ULT None None All All 393 (42.3) 92 (47.2) All All 274 (26.9) 41 (32.8) All All 3 (0.4) 1 (1.5) None None None None 77 (45.3) 3 (75.0) None None 150 (26.6) 38 (23.6) All All 128 (27.9) 15 (24.6) All All
Age at Onset All All All All 47 (5.1) 23 (11.8) All All 24 (2.4) 6 (4.8) All All None None None None None None None None None None 30 (5.3) 10 (6.2) All All 17 (3.7) 10 (16.4) All All
Disease Duration All All All All 47 (5.1) 23 (11.8) All All 24 (2.4) 6 (4.8) All All None None None None None None None None None None 30 (5.3) 10 (6.2) All All 17 (3.7) 10 (16.4) All All
Flares All All All All 119 (12.8) 34 (17.4) All All 72 (7.1) 6 (4.8) All All None None None None None None None None None None 38 (6.7) 18 (11.2) All All 20 (4.4) 9 (14.8) All All
Tophi All All All All 203 (21.9) 40 (20.5) All All 478 (47.0) 51 (40.8) All All None None 1 (0.4) None None None None None None None 114 (20.2) 38 (23.6) All All 37 (8.1) 6 (9.8) All All
PRS - Imputed None None None None None None None None None None None None None None None None None None None None None None All All All All All All All All
PRS - Direct None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
Prophylaxis All All All All 871 (93.9) 182 (93.3) 713 (99.9) All 314 (30.9) 43 (34.4) All All None None None None None None None None None None 361 (64.1) 116 (72.0) 304 (99.3) All 406 (88.6) 49 (80.3) All 211 (99.5)
BMI 25 (0.4) 7 (0.8) 533 (0.4) 539 (0.3) 56 (6.0) 16 (8.2) 170 (23.8) 46 (7.8) 47 (4.6) 2 (1.6) All All 4 (0.5) None 1 (0.4) None None None None None None None 9 (1.6) 5 (3.1) 7 (2.3) 21 (4.6) 11 (2.4) 4 (6.6) 2 (0.9) 9 (4.2)
Hypertension None None None None 355 (38.3) 35 (17.9) 452 (63.3) 281 (47.4) 362 (35.6) 35 (28.0) All All None None 1 (0.4) None None None None None None None 153 (27.2) 20 (12.4) 218 (71.2) 305 (66.3) 201 (43.9) 15 (24.6) 171 (78.4) 158 (74.5)
Type 2 Diabetes 715 (10.1) 90 (10.6) 16,348 (10.9) 23,204 (12.8) 67 (7.2) 16 (8.2) 303 (42.4) 279 (47.0) 187 (18.4) 30 (24.0) All All None None None None None None None None None None 102 (18.1) 23 (14.3) 48 (15.7) 63 (13.7) 14 (3.1) None 7 (3.2) 10 (4.7)
Heart Disease None None None None 197 (21.2) 32 (16.4) 364 (51.0) 303 (51.1) 519 (51.0) 53 (42.4) All All None None 1 (0.4) None None None None None None None 95 (16.9) 18 (11.2) 60 (19.6) 95 (20.7) 34 (7.4) 4 (6.6) 50 (22.9) 17 (8.0)
Kidney Disease 325 (4.6) 44 (5.2) 6,886 (4.6) 8,457 (4.7) 212 (22.8) 40 (20.5) 342 (47.9) 224 (37.8) 514 (50.5) 45 (36.0) All All 4 (0.5) None 2 (0.9) None None None None None None None 179 (31.8) 46 (28.6) 248 (81.0) 375 (81.5) 73 (15.9) 10 (16.4) 177 (81.2) 163 (76.9)
Dyslipidemia None None None None 356 (38.4) 62 (31.8) 407 (57.0) 184 (31.0) 256 (25.2) 37 (29.6) All All None None None None None None 1 (0.6) None None None 199 (35.3) 47 (29.2) 208 (68.0) 330 (71.7) 151 (33.0) 16 (26.2) 156 (71.6) 133 (62.7)
Stroke None None None None 288 (31.0) 61 (31.3) 254 (35.6) 85 (14.3) 601 (59.1) 74 (59.2) All All None None None None None None None None None None 135 (24.0) 28 (17.4) 68 (22.2) 88 (19.1) 48 (10.5) 5 (8.2) 47 (21.6) 17 (8.0)
Alcoholic Drinks / Week 977 (13.8) 397 (46.7) 30,941 (20.7) 63,472 (35.0) None None 120 (16.8) 219 (36.9) 509 (50.0) 58 (46.4) All All All All All All All All All All All All 42 (7.5) 3 (1.9) 1 (0.3) 6 (1.3) 40 (8.7) 3 (4.9) 4 (1.8) 2 (0.9)
Sugar-Sweetened Drinks / Week All All All All 101 (10.9) 24 (12.3) 169 (23.7) 48 (8.1) 759 (74.6) 94 (75.2) All All All All All All All All All All All All 84 (14.9) 17 (10.6) 6 (2.0) 11 (2.4) 95 (20.7) 6 (9.8) 5 (2.3) 2 (0.9)
Current Smoker None None None None 437 (47.1) 117 (60.0) 242 (33.9) 127 (21.4) 464 (45.6) 48 (38.4) All All All All 2 (0.9) None None None None None 3 (2.9) None 280 (49.7) 81 (50.3) 137 (44.8) 232 (50.4) 320 (69.9) 25 (41.0) 97 (44.5) 75 (35.4)
Family History of Gout All All All All 73 (7.9) 19 (9.7) 394 (55.2) 304 (51.3) 243 (23.9) 18 (14.4) 33 (73.3) 69 (90.8) All All All All All All All All All All 74 (13.1) 18 (11.2) 49 (16.0) 67 (14.6) 61 (13.3) 4 (6.6) 32 (14.7) 30 (14.2)
No. Relatives w/ Gout All All All All 278 (30.0) 62 (31.8) 506 (70.9) 455 (76.7) 635 (62.4) 82 (65.6) All All All All All All All All All All All All 156 (27.7) 40 (24.8) 90 (29.4) 105 (22.8) 92 (20.1) 16 (26.2) 50 (22.9) 59 (27.8)
Note: ‘All’ = all missing, ‘None’ = none missing

So now the following is true about the final cohort:

  1. Age at collection and the directly genotyped PRS are both fully phenotyped

  2. The Imputed PRS is fully phenotyped for European cohorts

  3. Serum urate has around 5% missingness in the UK Biobank, 2.5% in Aus/NZ Europeans, 5% in GlobalGout gout cases, 80% in GlobalGout controls, < 1% in all Ardea cohorts, <1% in Polynesian gout cases, and around 10% in Polynesian controls

  4. ULT has around 30-40% missingness in most gout cohorts except 4 of the 5 Ardea cohorts and the UK Biobank gout cohort

  5. The three severity traits are completely available in all Ardea cohorts and completely missing in the UK Biobank. In the remaining cohorts, age at onset/disease duration has around 5 - 10% missingness, flares have around 5 - 15% missingness, and tophi have 10 - 20% missingness for Aus/NZ and around 45% missingness in GlobalGout

  6. Prophylaxis data is sparse for Aus/NZ Europeans and Polynesians, 30% missing for GlobalGout and well phenotyped in Ardea

  7. Comorbidity data is not well phenotyped in general and thus not very good for running models, but okay for descriptive stats

  8. Lifestyle factors are inconsistently phenotyped, but could be useful in sensitivity analyses

  9. Finally, family history data is not well phenotyped but still could be of some use for assessing whether the PRS strongly associates with it and whether it could be an easier way to assess genetics of gout than genotyping

Given the relatively low proportions of missingness for our most important variables, we are probably okay to just use complete cases only for each model. This will of course reduce power, and may introduce bias which I need to be okay with as a limitation.


Characteristics of each Cohort

table1 <- tibble("Cohort" = cohortstring, 
                 "N" = unlist(lapply(data_list2, nrow)),
                 "Age at Collection (years)" = unlist(lapply(data_list2, function(x) report(x$AGECOL))),
                 "Serum Urate (mg/dL)" = unlist(lapply(data_list2, function(x) report(x$URATE))),
                 "ULT" = unlist(lapply(data_list2, function(x) sumreport(x$ULT))),
                 "Age at Onset (years)" = unlist(lapply(data_list2, function(x) report(x$AGE1ATK))),
                 "Disease Duration (years)" = unlist(lapply(data_list2, function(x) report(x$DURATION))),
                 "Number of Flares in Last Year" = unlist(lapply(data_list2, function(x) report_median(x$NUMATK))),
                 "Presence of Tophi" = unlist(lapply(data_list2, function(x) sumreport(x$TOPHIGOUT))),
                 "PRS - Imputed" = unlist(lapply(data_list2, function(x) report(x$PRS1))),
                 "PRS - Direct" = unlist(lapply(data_list2, function(x) report(x$PRS2))),
                 "Prophylaxis" = unlist(lapply(data_list2, function(x) sumreport(x$PROPHY))),
                 "BMI" = unlist(lapply(data_list2, function(x) report(x$BMI))),
                 "Hypertension" = unlist(lapply(data_list2, function(x) sumreport(x$HYPERTENSION))),
                 "Type 2 Diabetes" = unlist(lapply(data_list2, function(x) sumreport(x$DIABETES))),
                 "Heart Disease" = unlist(lapply(data_list2, function(x) sumreport(x$HEART))),
                 "Kidney Disease" = unlist(lapply(data_list2, function(x) sumreport(x$KIDNEY))),
                 "Dyslipidemia" = unlist(lapply(data_list2, function(x) sumreport(x$LIPIDS))),
                 "Stroke" = unlist(lapply(data_list2, function(x) sumreport(x$STROKE))),
                 "Alcoholic Drinks / Week" = unlist(lapply(data_list2, function(x) report(x$TOTALALC))),
                 "Sugar-Sweetened Drinks / Week" = unlist(lapply(data_list2, function(x) report(x$SUGDRINK))),
                 "Current Smoker" = unlist(lapply(data_list2, function(x) sumreport(x$CURSMOKE))),
                 "Family History of Gout" = unlist(lapply(data_list2, function(x) sumreport(x$FAMGOUT))),
                 "No. Relatives w/ Gout" = unlist(lapply(data_list2, function(x) report(x$FAMGOUTNUM))))

table1 <- transpose_df(table1) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(table1), ~ str_replace(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(table1) <- str_replace(row.names(table1), " ", "&nbsp;")

table1 %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px") %>% 
  footnote("Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
N 7094 851 149748 181186 928 195 714 593 1017 125 45 76 776 65 223 16 233 8 170 4 102 9 563 161 306 460 458 61 218 212
Age at Collection (years) 60.0 ± 7.0 61.7 ± 6.1 57.0 ± 8.1 56.7 ± 7.9 62.7 ± 12.2 70.7 ± 12.4 55.9 ± 16.6 51.9 ± 17.2 60.1 ± 13.2 67.8 ± 10.8 57.6 ± 16.7 65.2 ± 11.0 51.3 ± 11.8 60.7 ± 10.6 52.4 ± 11.2 61.4 ± 7.4 53.1 ± 10.9 55.2 ± 14.8 54.4 ± 10.8 63.8 ± 5.4 54.1 ± 11.8 64.6 ± 15.1 55.3 ± 12.4 60.3 ± 12.1 45.3 ± 15.6 46.3 ± 15.2 47.5 ± 12.3 52.0 ± 14.4 39.6 ± 15.1 40.4 ± 15.2
Serum Urate (mg/dL) 6.7 ± 1.7 6.1 ± 2.0 5.9 ± 1.2 4.5 ± 1.1 6.7 ± 1.9 6.5 ± 2.4 5.5 ± 2.8 3.1 ± 2.6 7.4 ± 2.3 7.7 ± 2.7 7.0 ± 1.8 6.4 ± 1.7 8.9 ± 1.2 8.9 ± 1.4 7.8 ± 1.4 8.1 ± 1.2 7.9 ± 1.5 8.2 ± 2.0 8.8 ± 1.5 10.1 ± 1.2 9.3 ± 1.7 8.0 ± 1.4 7.0 ± 2.2 6.5 ± 2.5 6.5 ± 1.8 5.4 ± 1.6 7.7 ± 2.1 7.0 ± 2.8 6.7 ± 1.9 5.4 ± 1.7
ULT 4036 (56.9) 331 (38.9) NA NA 534 (99.8) 102 (99.0) NA NA 564 (75.9) 49 (58.3) NA NA 237 (30.7) 25 (39.1) 223 (100.0) 16 (100.0) 233 (100.0) 8 (100.0) 93 (100.0) 1 (100.0) 102 (100.0) 9 (100.0) 380 (92.0) 112 (91.1) NA NA 314 (95.2) 44 (95.7) NA NA
Age at Onset (years) NA NA NA NA 46.6 ± 15.7 60.6 ± 15.1 NA NA 46.4 ± 14.0 57.6 ± 12.5 NA NA 41.3 ± 13.4 55.5 ± 11.4 41.7 ± 12.3 55.2 ± 11.2 42.8 ± 13.3 47.6 ± 19.5 40.2 ± 13.3 61.5 ± 6.2 43.2 ± 13.3 52.0 ± 18.9 37.9 ± 14.2 48.8 ± 15.6 NA NA 34.3 ± 11.9 43.0 ± 15.7 NA NA
Disease Duration (years) NA NA NA NA 17.0 ± 12.8 10.4 ± 9.9 NA NA 14.6 ± 11.5 11.0 ± 10.4 NA NA 11.0 ± 9.5 6.2 ± 7.3 11.6 ± 9.7 7.1 ± 9.5 11.3 ± 10.0 8.6 ± 11.2 15.2 ± 10.3 3.2 ± 1.0 11.9 ± 9.0 13.6 ± 15.4 18.2 ± 13.5 13.2 ± 12.9 NA NA 13.9 ± 10.3 9.2 ± 8.7 NA NA
Number of Flares in Last Year NA NA NA NA 2 (0 - 4) 2 (0 - 4) NA NA 2 (1 - 4) 3 (1.5 - 4) NA NA 4 (3 - 8) 3 (3 - 6) 3 (2 - 6) 3 (3 - 4) 4 (2 - 6) 4.5 (2 - 6) 4 (3 - 6) 4.5 (2.25 - 6) 4 (2 - 9.5) 4 (3 - 5) 3 (1 - 6) 2 (0 - 5.5) NA NA 4 (2 - 10) 3 (1 - 6) NA NA
Presence of Tophi NA NA NA NA 310 (42.8) 64 (41.3) NA NA 314 (58.3) 47 (63.5) NA NA 135 (17.4) 5 (7.7) 33 (14.9) 1 (6.2) 51 (21.9) 3 (37.5) 169 (99.4) 4 (100.0) 26 (25.5) 5 (55.6) 163 (36.3) 30 (24.4) NA NA 192 (45.6) 16 (29.1) NA NA
PRS - Imputed 5.6 ± 0.7 5.5 ± 0.7 5.1 ± 0.7 5.1 ± 0.7 5.6 ± 0.7 5.5 ± 0.7 5.2 ± 0.7 5.2 ± 0.7 5.6 ± 0.7 5.5 ± 0.6 5.5 ± 0.7 5.3 ± 0.7 5.7 ± 0.7 5.6 ± 0.7 5.7 ± 0.7 5.8 ± 0.7 5.8 ± 0.6 6.0 ± 0.7 5.8 ± 0.6 5.7 ± 0.4 5.7 ± 0.7 5.8 ± 0.3 NA NA NA NA NA NA NA NA
PRS - Direct 4.2 ± 0.6 4.1 ± 0.6 3.8 ± 0.6 3.8 ± 0.6 4.2 ± 0.6 4.2 ± 0.6 3.9 ± 0.6 3.8 ± 0.6 4.2 ± 0.6 4.1 ± 0.6 4.1 ± 0.7 3.9 ± 0.6 4.3 ± 0.7 4.3 ± 0.6 4.4 ± 0.6 4.4 ± 0.6 4.4 ± 0.6 4.7 ± 0.8 4.4 ± 0.6 4.3 ± 0.5 4.3 ± 0.6 4.5 ± 0.3 4.5 ± 0.5 4.5 ± 0.5 4.4 ± 0.4 4.3 ± 0.5 5.0 ± 0.6 4.9 ± 0.6 4.5 ± 0.6 4.5 ± 0.5
Prophylaxis NA NA NA NA 54 (94.7) 13 (100.0) 0 (0.0) NA 440 (62.6) 57 (69.5) NA NA 769 (99.1) 65 (100.0) 223 (100.0) 16 (100.0) 233 (100.0) 8 (100.0) 170 (100.0) 4 (100.0) 102 (100.0) 9 (100.0) 189 (93.6) 41 (91.1) 2 (100.0) NA 52 (100.0) 10 (83.3) NA 0 (0.0)
BMI 30.4 ± 4.7 32.2 ± 6.6 27.8 ± 4.2 27.0 ± 5.1 30.2 ± 5.3 30.6 ± 7.3 27.0 ± 4.4 26.8 ± 6.1 29.4 ± 4.7 31.1 ± 6.8 NA NA 34.3 ± 6.7 38.2 ± 10.4 34.3 ± 6.3 38.1 ± 6.5 33.8 ± 6.1 36.2 ± 6.9 32.1 ± 5.4 36.5 ± 3.8 31.1 ± 4.9 35.7 ± 8.1 35.6 ± 7.9 38.3 ± 9.1 31.9 ± 7.0 32.5 ± 8.6 36.0 ± 6.7 38.8 ± 9.1 33.2 ± 6.2 34.3 ± 7.6
Hypertension 4969 (70.0) 671 (78.8) 59467 (39.7) 54894 (30.3) 553 (96.5) 158 (98.8) 156 (59.5) 123 (39.4) 650 (99.2) 90 (100.0) NA NA 379 (48.8) 48 (73.8) 139 (62.6) 15 (93.8) 165 (70.8) 7 (87.5) 98 (57.6) 4 (100.0) 54 (52.9) 9 (100.0) 384 (93.7) 140 (99.3) 87 (98.9) 145 (93.5) 224 (87.2) 43 (93.5) 41 (87.2) 50 (92.6)
Type 2 Diabetes 1275 (20.0) 186 (24.4) 11687 (8.8) 7637 (4.8) 137 (15.9) 50 (27.9) 49 (11.9) 42 (13.4) 343 (41.3) 53 (55.8) NA NA 74 (9.5) 16 (24.6) 30 (13.5) 6 (37.5) 32 (13.7) 1 (12.5) 22 (12.9) 2 (50.0) 12 (11.8) 0 (0.0) 170 (36.9) 74 (53.6) 66 (25.6) 84 (21.2) 85 (19.1) 29 (47.5) 37 (17.5) 53 (26.2)
Heart Disease 1922 (27.1) 249 (29.3) 20676 (13.8) 11150 (6.2) 309 (42.3) 81 (49.7) 81 (23.1) 36 (12.4) 155 (31.1) 41 (56.9) NA NA 37 (4.8) 3 (4.6) 17 (7.7) 0 (0.0) 24 (10.3) 0 (0.0) 18 (10.6) 0 (0.0) 5 (4.9) 1 (11.1) 190 (40.6) 82 (57.3) 58 (23.6) 57 (15.6) 78 (18.4) 20 (35.1) 12 (7.1) 20 (10.3)
Kidney Disease 968 (14.3) 236 (29.2) 4978 (3.5) 7973 (4.6) 346 (48.3) 113 (72.9) 207 (55.6) 267 (72.4) 226 (44.9) 64 (80.0) NA NA 131 (17.0) 28 (43.1) 29 (13.1) 9 (56.2) 37 (15.9) 4 (50.0) 31 (18.2) 2 (50.0) 14 (13.7) 5 (55.6) 192 (50.0) 88 (76.5) 39 (67.2) 67 (78.8) 139 (36.1) 36 (70.6) 22 (53.7) 35 (71.4)
Dyslipidemia 3642 (51.3) 443 (52.1) 42686 (28.5) 30790 (17.0) 470 (82.2) 110 (82.7) 159 (51.8) 141 (34.5) 556 (73.1) 72 (81.8) NA NA 319 (41.1) 36 (55.4) 108 (48.4) 13 (81.2) 96 (41.2) 4 (50.0) 74 (43.8) 3 (75.0) 40 (39.2) 6 (66.7) 327 (89.8) 107 (93.9) 83 (84.7) 107 (82.3) 255 (83.1) 41 (91.1) 47 (75.8) 47 (59.5)
Stroke 562 (7.9) 88 (10.3) 6089 (4.1) 4490 (2.5) 43 (6.7) 15 (11.2) 137 (29.8) 231 (45.5) 41 (9.9) 8 (15.7) NA NA 7 (0.9) 1 (1.5) 3 (1.3) 0 (0.0) 1 (0.4) 0 (0.0) 2 (1.2) 0 (0.0) 0 (0.0) 1 (11.1) 28 (6.5) 14 (10.5) 13 (5.5) 20 (5.4) 13 (3.2) 6 (10.7) 5 (2.9) 4 (2.1)
Alcoholic Drinks / Week 7.6 ± 5.4 6.0 ± 5.1 6.6 ± 5.2 5.8 ± 4.9 7.9 ± 10.6 2.3 ± 4.9 4.5 ± 9.0 2.8 ± 4.4 14.4 ± 19.4 4.0 ± 7.1 NA NA NA NA NA NA NA NA NA NA NA NA 5.5 ± 12.9 1.8 ± 6.5 5.0 ± 10.7 2.6 ± 6.0 4.0 ± 8.3 0.8 ± 2.6 4.0 ± 10.5 1.1 ± 3.6
Sugar-Sweetened Drinks / Week NA NA NA NA 1.0 ± 1.5 0.6 ± 1.0 0.9 ± 1.3 0.5 ± 1.1 0.8 ± 1.2 0.7 ± 1.2 NA NA NA NA NA NA NA NA NA NA NA NA 1.6 ± 1.9 1.0 ± 1.6 1.8 ± 2.4 1.2 ± 1.7 2.2 ± 2.2 1.5 ± 1.5 2.0 ± 1.8 1.3 ± 1.4
Current Smoker 440 (6.2) 70 (8.2) 13147 (8.8) 12319 (6.8) 23 (4.7) 4 (5.1) 20 (4.2) 19 (4.1) 92 (16.6) 13 (16.9) NA NA NA NA 38 (17.2) 0 (0.0) 26 (11.2) 0 (0.0) 30 (17.6) 0 (0.0) 15 (15.2) 0 (0.0) 55 (19.4) 9 (11.2) 47 (27.8) 53 (23.2) 15 (10.9) 2 (5.6) 24 (19.8) 20 (14.6)
Family History of Gout NA NA NA NA 379 (44.3) 80 (45.5) 49 (15.3) 71 (24.6) 266 (34.4) 44 (41.1) 6 (50.0) 2 (28.6) NA NA NA NA NA NA NA NA NA NA 335 (68.5) 111 (77.6) 108 (42.0) 193 (49.1) 252 (63.5) 36 (63.2) 73 (39.2) 73 (40.1)
No. Relatives w/ Gout NA NA NA NA 0.9 ± 1.0 0.9 ± 1.2 0.2 ± 0.5 0.5 ± 0.7 0.7 ± 0.9 0.9 ± 0.8 NA NA NA NA NA NA NA NA NA NA NA NA 1.8 ± 2.2 1.9 ± 1.6 0.8 ± 1.3 0.9 ± 1.2 1.6 ± 2.1 1.5 ± 1.8 0.6 ± 0.9 0.6 ± 0.9
Note: Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).

From this table we learn the following:

  1. In general, gout cohorts consist of approximately 80 - 90% males, while control cohorts are closer to 50% male.

  2. European gout cohorts are approximately 55 - 60 years old on average, while Polynesian gout cohorts are around 50 - 55 years old. European control cohorts are around 55 years old, while Polynesian controls are around 40 - 45 years old. Female gout cases are on average older than male gout cases.

  3. Serum urate levels for gout cohorts tend to be between 6.5 and 9 mg/dL, while control cohorts sit around 4 to 6 mg/dL on average. Females in general have lower serum urate than males.

  4. Most gout cohorts have high rates of ULT (many are over 90% on ULT). Similar ULT usage is seen in males and females, though perhaps slightly less females are prescribed ULT than males.

  5. Mean onset is around 40 - 50 for European cohorts and around 35 - 40 for Polynesians. Average disease duration is around 10 - 20 years for all cohorts. All females have a much higher age at onset compared to males and a shorter disease duration.

  6. Gout cohorts have up to 10 flares per year on average. No obvious difference in flares between males and females, perhaps slightly lower in Polynesian females vs Polynesian males.

  7. Gout cohorts tend to have between 15 and 50% tophaceous gout. Inconsistent decrease in number of tophi in females vs males.

  8. The imputed PRS sits around 5.5 for most gout cohorts, and around 5.1 for control cohorts. The directly genotyped PRS sits around 4.2 - 4.4 for European gout cohorts, around 3.8 - 3.9 for European control cohorts, around 4.5 - 4.9 for Polynesian gout cohorts and around 4.5 for Polynesian control cohorts. Female gout cases may have a slightly lower PRS than males, but this is probably not real.

  9. For cohorts with prophylaxis data, almost all gout cases are on prophylaxis. No obvious sex differences.

  10. European gout cases have a mean BMI of between 30 and 35 on average, while European controls are closer to 27 on average. Polynesian cases have a mean BMI of 36 while controls have around 33 on average. Female gout cases consistently have higher BMI and higher rates of all comorbidities than their male counterparts, while female controls have lower BMI and lower rates of all comorbidities than male controls.

  11. Gout cohorts are around 50 - 100% hypertensive, while European control cohorts are around 35 - 50% and Polynesian control cohorts are 90 - 95% hypertensive

  12. Type 2 diabetes seems to be more common in gout cohorts, at between 10 and 40%, while it sits at around 7 - 13% for European controls, and around 23% for Polynesian controls

  13. Heart disease is around 20 - 45% frequency in gout cohorts, though Ardea cohorts all show lower amounts at around 5 - 10% each. The control cohorts sit around 10 - 20% each

  14. Kidney disease is relatively common among the Aus/NZ cohorts and the GlobalGout cohort, but not as common among the Ardea cohorts or UK Biobank. This may be because mainly the people involved in renal disease studies had kidney disease measurements

  15. Dyslipidemia clearly associates with gout status, sitting at around 20 to 42% in European control cohorts vs 50 to 80% in European gout cohorts. Ardea cohorts have closer to 40 to 50% dyslipidemia however. The Polynesian cohorts appear to have very high proportions of dyslipidemia, particularly in the gout cohorts.

  16. Stroke proportions are mostly low, except for the Aus/NZ control cohort who have very high proportions of this disease, and in general stroke associates with gout status.

  17. Alcoholic drinks do associate with gout status, and generally Polynesians drink less than Europeans. Males tend to drink much more than females, regardless of gout status, but male gout vs controls have higher rates of drinking while females are no different between cases and controls.

  18. Sugar sweetened drinks also seem to associate slightly with gout status, and Polynesians drink more of these than Europeans. Sugar-sweetened bevarages seems to be gout associated in all Europeans and West Polynesians but not East Polynesians. Also females tend to drink less of these than men.

  19. Smoking status shows no correlation with gout status but does seem to be higher rates in GlobalGout, Ardea, and Polynesian cohorts. Smoking status may be positively gout associated in females but negatively associated in males (European specific).

  20. Family history of gout is clearly correlated with gout status, and is much higher in Polynesians in general, which is also reflected by mean no. relatives with gout. As expected, family history of gout associates with gout status independent of sex.

I would be inclined to remove any traits that are not well phenotyped across our cohorts of interest, given that they seem to be highly biased towards specific studies that are interested in that trait, leading to much higher proportions with these traits than seems likely. Basically, I should figure out a cutoff that seems reasonable for excluding a variable. I could also look to check whether the variables are consistently phenotyped in all sub-cohorts.


Plotting distribution of each variable

# Plotting distributions of each of these variables in males and females separately, colored by cohort
all_cohorts <- all_pheno_prs2 %>% 
  mutate(SEX = factor(SEX, levels = c("Male", "Female")),
         GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
         GROUP = factor(case_when(GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Control",
                                  GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Control",
                                  GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Control"), 
                        levels = c("European Gout", "European Control", "East Polynesian Gout", "East Polynesian Control", "West Polynesian Gout", "West Polynesian Control")),
         GROUP2 = factor(case_when(GROUP == "European Gout" & SEX == "Male" ~ "European Gout - male",
                                  GROUP == "European Gout" & SEX == "Female" ~ "European Gout - female",
                                  GROUP == "European Control" & SEX == "Male" ~ "European Control - male",
                                  GROUP == "European Control" & SEX == "Female" ~ "European Control - female",
                                  GROUP == "East Polynesian Gout" & SEX == "Male" ~ "East Polynesian Gout - male",
                                  GROUP == "East Polynesian Gout" & SEX == "Female" ~ "East Polynesian Gout - female",
                                  GROUP == "East Polynesian Control" & SEX == "Male" ~ "East Polynesian Control - male",
                                  GROUP == "East Polynesian Control" & SEX == "Female" ~ "East Polynesian Control - female",
                                  GROUP == "West Polynesian Gout" & SEX == "Male" ~ "West Polynesian Gout - male",
                                  GROUP == "West Polynesian Gout" & SEX == "Female" ~ "West Polynesian Gout - female",
                                  GROUP == "West Polynesian Control" & SEX == "Male" ~ "West Polynesian Control - male",
                                  GROUP == "West Polynesian Control" & SEX == "Female" ~ "West Polynesian Control - female"), 
                        levels = c("European Gout - male", "European Gout - female", "European Control - male", "European Control - female", "East Polynesian Gout - male", "East Polynesian Gout - female", "East Polynesian Control - male", "East Polynesian Control - female", "West Polynesian Gout - male", "West Polynesian Gout - female", "West Polynesian Control - male", "West Polynesian Control - female")),
         GROUP3 = factor(case_when(GOUT & SEX == "Male" ~ "Male Gout",
                                   GOUT & SEX == "Female" ~ "Female Gout",
                                   !GOUT & SEX == "Male" ~ "Male Control",
                                   !GOUT & SEX == "Female" ~ "Female Control"), 
                         levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")),
         COHORT2 = factor(case_when(GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
                                    !GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Control",
                                    GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Gout",
                                    !GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Control",
                                    Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LASSO",
                                    Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR1",
                                    Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR2",
                                    Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CRYSTAL",
                                    Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LIGHT",
                                    GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Control"), 
                          levels = c("UK Biobank - Gout", "UK Biobank - Control", "Aus/NZ - Gout", "Aus/NZ - Control", "GlobalGout - Gout", "GlobalGout - Control", "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT", "East Polynesian - Gout", "East Polynesian - Control", "West Polynesian - Gout", "West Polynesian - Control"))) %>% 
  filter(!is.na(COHORT2))

Age at collection

# Age at collection
all_cohorts %>% 
  ggplot(aes(x = AGECOL, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Age at Collection (years)") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_x_continuous(limits = c(0, 100)) + 
    scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2)))

Serum urate

# Serum urate
all_cohorts %>% 
  filter(!is.na(URATE)) %>% 
  ggplot(aes(x = URATE, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Serum Urate (mg/dL)") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_x_continuous(limits = c(0, max(all_cohorts$URATE, na.rm = T))) + 
    scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2)))

Urate-lowering therapy

# ULT
all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(ULT = factor(case_when(ULT ~ "On ULT",
                                !ULT ~ "Not on ULT",
                                is.na(ULT) ~ "No Data"), 
                      levels = c("No Data", "Not on ULT", "On ULT"))) %>% 
  group_by(COHORT2, ULT, SEX) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(x = COHORT2, y = value, fill = ULT, color = COHORT2)) +
    geom_bar(position = "fill", stat = "identity") +
    facet_wrap(~ SEX) +
    scale_fill_discrete(type = c("black", "#C0C0C0", "#505050"), limits = c("On ULT", "Not on ULT", "No Data")) +
    theme(axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank())
## `summarise()` has grouped output by 'COHORT2', 'ULT'. You can override using the `.groups` argument.

Age at onset

# Age at onset
all_cohorts %>% 
  filter(GOUT, !is.na(AGE1ATK)) %>% 
  ggplot(aes(x = AGE1ATK, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Age at Onset (years)") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_x_continuous(limits = c(0, 100)) + 
    scale_y_discrete(limits = rev(levels(all_cohorts %>% filter(GOUT, !is.na(AGE1ATK)) %>% mutate(COHORT2 = factor(COHORT2)) %>% pull(COHORT2))))

Disease duration

# Disease duration
all_cohorts %>% 
  filter(GOUT, !is.na(DURATION)) %>% 
  ggplot(aes(x = DURATION, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Disease Duration (years)") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_x_continuous(limits = c(0, max(all_cohorts$DURATION, na.rm = T))) + 
    scale_y_discrete(limits = rev(levels(all_cohorts %>% filter(GOUT, !is.na(DURATION)) %>% mutate(COHORT2 = factor(COHORT2)) %>% pull(COHORT2))))
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

Flare frequency

# Flare frequency
all_cohorts %>% 
  filter(GOUT, !is.na(NUMATK)) %>% 
  mutate(NUMATK = case_when(NUMATK > 52 ~ 52,
                            TRUE ~ NUMATK)) %>% 
  ggplot(aes(x = NUMATK, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Number of Flares in Last Year") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_x_continuous(limits = c(0, 52)) + 
    scale_y_discrete(limits = rev(levels(all_cohorts %>% filter(GOUT, !is.na(NUMATK)) %>% mutate(COHORT2 = factor(COHORT2)) %>% pull(COHORT2))))

Flare frequency (categorical)

all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(FLARE_CAT = factor(case_when(is.na(FLARE_CAT) ~ "No Data",
                                      TRUE ~ as.character(FLARE_CAT)), 
                            levels = rev(c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52",
                                       "No Data")),
                            ordered = TRUE)) %>% 
  group_by(COHORT2, FLARE_CAT, SEX) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(x = COHORT2, y = value, fill = FLARE_CAT, color = COHORT2)) +
    geom_bar(position = "fill", stat = "identity") +
    facet_wrap(~ SEX) +
    scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
    theme(axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank())
## `summarise()` has grouped output by 'COHORT2', 'FLARE_CAT'. You can override using the `.groups` argument.

Tophi

# Tophi
all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(TOPHIGOUT = factor(case_when(TOPHIGOUT ~ "Tophi",
                                !TOPHIGOUT ~ "No Tophi",
                                is.na(TOPHIGOUT) ~ "No Data"), 
                      levels = c("No Data", "No Tophi", "Tophi"))) %>% 
  group_by(COHORT2, TOPHIGOUT, SEX) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(x = COHORT2, y = value, fill = TOPHIGOUT, color = COHORT2)) +
    geom_bar(position = "fill", stat = "identity") +
    facet_wrap(~ SEX) +
    scale_fill_discrete(type = c("black", "#C0C0C0", "#505050"), limits = c("Tophi", "No Tophi", "No Data")) +
    theme(axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank())
## `summarise()` has grouped output by 'COHORT2', 'TOPHIGOUT'. You can override using the `.groups` argument.

Imputed PRS

# PRS - imputed
all_cohorts %>% 
  filter(!str_detect(COHORT2, "Polynesian")) %>% 
  ggplot(aes(x = PRS1, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Gout PRS - Imputed") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_y_discrete(limits = rev(levels(all_cohorts %>% filter(!str_detect(COHORT2, "Polynesian")) %>% mutate(COHORT2 = factor(COHORT2)) %>% pull(COHORT2))))

Genotyped PRS

# PRS - direct
all_cohorts %>% 
  ggplot(aes(x = PRS2, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Gout PRS - Directly Genotyped") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_y_discrete(limits = rev(levels(all_cohorts$COHORT2)))


Plotting relationship between age/sex and each outcome of interest

Gout

# Plotting relationship between covariates (age + sex) and each outcome of interest (GOUT, SU, Onset, Duration, Flares, Tophi)
# Gout
all_cohorts %>%
  filter(!is.na(GROUP)) %>% 
  mutate(GROUP = factor(case_when(str_detect(GROUP, "European") ~ "European",
                           str_detect(GROUP, "East") ~ "East Polynesian",
                           str_detect(GROUP, "West") ~ "West Polynesian"),
                        levels = c("European", "East Polynesian", "West Polynesian"))) %>% 
  ggplot(mapping = aes(x = GOUT2, y = AGECOL, fill = GROUP)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Mean Age at Collection (years)") +
    #scale_fill_discrete(limits = c("Gout", "Control")) +
    theme(axis.title.x = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            legend.title = element_blank())

Serum urate v Age

# Serum urate
all_cohorts %>% 
  filter(!is.na(URATE)) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                                 ULT ~ "On ULT",
                                 is.na(ULT) ~ "No Data / Control"),
                       levels = c("On ULT", "Not on ULT", "No Data / Control"))) %>% 
  ggplot(mapping = aes(x = AGECOL, y = URATE, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX * ULT2) +
    labs(x = "Mean Age at Collection (years)", 
         y = "Serum Urate (mg/dL)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Serum urate v Duration

all_cohorts %>% 
  filter(!is.na(URATE)) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                                 ULT ~ "On ULT",
                                 is.na(ULT) ~ "No Data / Control"),
                       levels = c("On ULT", "Not on ULT", "No Data / Control"))) %>% 
  ggplot(mapping = aes(x = DURATION, y = URATE, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX * ULT2) +
    labs(x = "Disease Duration (years)", 
         y = "Serum Urate (mg/dL)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 325243 rows containing non-finite values (stat_smooth).

ULT

# ULT
all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                          ULT ~ "On ULT",
                          is.na(ULT) ~ "No Data"),
                       levels = c("On ULT", "Not on ULT", "No Data"))) %>% 
  ggplot(mapping = aes(x = ULT2, y = AGECOL, fill = GROUP)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Mean Age at Collection (years)") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())

Onset

# Age at onset
all_cohorts %>% 
  filter(!is.na(AGE1ATK)) %>% 
  ggplot(mapping = aes(x = AGECOL, y = AGE1ATK, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Age at Collection (years)", y = "Age at Onset (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Duration

# Disease duration
all_cohorts %>% 
  filter(GOUT, !is.na(AGE1ATK)) %>% 
  ggplot(mapping = aes(x = AGECOL, y = DURATION, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Age at Collection (years)", y = "Disease Duration (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Flare frequency (all)

# Flare frequency
all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = FLARE_CAT, y = AGECOL, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Age at Collection (years)") +
    theme(legend.title = element_blank())

Flare frequency (> 1)

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         NUMATK >= 2) %>% 
  ggplot(aes(x = FLARE_CAT, y = AGECOL, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Age at Collection (years)") +
    theme(legend.title = element_blank())

Tophi

# Tophi
all_cohorts %>% 
  filter(GOUT,
         !str_detect(COHORT2, "CRYSTAL")) %>% 
  mutate(TOPHIGOUT = factor(case_when(TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi",
                                      is.na(TOPHIGOUT) ~ "No Data"), 
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  ggplot(mapping = aes(x = TOPHIGOUT, y = AGECOL, fill = GROUP)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Mean Age at Collection (years)") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())


Plotting relationship between both PRSs and each outcome of interest

Gout vs Imputed PRS

# Gout
all_cohorts %>%
  filter(!is.na(GROUP),
         str_detect(GROUP, "European")) %>% 
  ggplot(aes(x = COHORT2, y = PRS1, fill = GOUT2)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Gout PRS - Imputed") +
    scale_x_discrete(limits = rev) +
    scale_fill_discrete(limits = c("Gout", "Control")) +
    theme(axis.title.y = element_blank(),
            panel.grid.major.y = element_blank(),
            panel.grid.minor.y = element_blank(),
            legend.title = element_blank()) +
    coord_flip()

Gout vs Genotyped PRS

all_cohorts %>%
  filter(!is.na(GROUP)) %>% 
  mutate(GROUP = factor(case_when(str_detect(GROUP, "European") ~ "European",
                           str_detect(GROUP, "East") ~ "East Polynesian",
                           str_detect(GROUP, "West") ~ "West Polynesian"),
                        levels = c("European", "East Polynesian", "West Polynesian"))) %>% 
  ggplot(aes(x = GROUP, y = PRS2, fill = GOUT2)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Gout PRS - Directly Genotyped") +
    scale_fill_discrete(limits = c("Gout", "Control")) +
    theme(axis.title.x = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            legend.title = element_blank())

Serum urate vs Imputed PRS

# Serum urate
all_cohorts %>% 
  filter(!is.na(URATE)) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                                 ULT ~ "On ULT",
                                 is.na(ULT) ~ "No Data / Control"),
                       levels = c("On ULT", "Not on ULT", "No Data / Control"))) %>% 
  ggplot(mapping = aes(x = PRS1, y = URATE, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX * ULT2) +
    labs(x = "Gout PRS - Imputed", 
         y = "Serum Urate (mg/dL)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2286 rows containing non-finite values (stat_smooth).

Serum urate vs Genotyped PRS

all_cohorts %>% 
  filter(!is.na(URATE)) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                                 ULT ~ "On ULT",
                                 is.na(ULT) ~ "No Data / Control"),
                       levels = c("On ULT", "Not on ULT", "No Data / Control"))) %>% 
  ggplot(mapping = aes(x = PRS2, y = URATE, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX * ULT2) +
    labs(x = "Gout PRS - Directly Genotyped", 
         y = "Serum Urate (mg/dL)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

ULT vs Imputed PRS

# ULT
all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                          ULT ~ "On ULT",
                          is.na(ULT) ~ "No Data"),
                       levels = c("On ULT", "Not on ULT", "No Data"))) %>% 
  ggplot(mapping = aes(x = ULT2, y = PRS1, fill = SEX)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    labs(y = "Gout PRS - Imputed") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())
## Warning: Removed 1243 rows containing non-finite values (stat_summary).

## Warning: Removed 1243 rows containing non-finite values (stat_summary).

ULT vs Genotyped PRS

all_cohorts %>% 
  filter(GOUT) %>% 
  mutate(ULT2 = factor(case_when(!ULT ~ "Not on ULT",
                          ULT ~ "On ULT",
                          is.na(ULT) ~ "No Data"),
                       levels = c("On ULT", "Not on ULT", "No Data"))) %>% 
  ggplot(mapping = aes(x = ULT2, y = PRS2, fill = GROUP)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Gout PRS - Directly Genotyped") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())

Onset vs Imputed PRS

# Age at onset
all_cohorts %>% 
  filter(!is.na(AGE1ATK)) %>% 
  ggplot(mapping = aes(x = PRS1, y = AGE1ATK, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Gout PRS - Imputed", y = "Age at Onset (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1176 rows containing non-finite values (stat_smooth).

Onset vs Genotyped PRS

all_cohorts %>% 
  filter(!is.na(AGE1ATK)) %>% 
  ggplot(mapping = aes(x = PRS2, y = AGE1ATK, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Gout PRS - Directly Genotyped", y = "Age at Onset (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Duration vs Imputed PRS

# Disease duration
all_cohorts %>% 
  filter(!is.na(DURATION)) %>% 
  ggplot(mapping = aes(x = PRS1, y = DURATION, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Gout PRS - Imputed", y = "Disease Duration (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1176 rows containing non-finite values (stat_smooth).

Duration vs Genotyped PRS

all_cohorts %>% 
  filter(!is.na(DURATION)) %>% 
  ggplot(mapping = aes(x = PRS2, y = DURATION, color = COHORT2)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Gout PRS - Directly Genotyped", y = "Disease Duration (years)") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Flares (all) vs Imputed PRS

# Flare frequency
all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = FLARE_CAT, y = PRS1, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Gout PRS - Imputed") +
    theme(legend.title = element_blank())
## Warning: Removed 1140 rows containing non-finite values (stat_summary).

## Warning: Removed 1140 rows containing non-finite values (stat_summary).

## Warning: Removed 1140 rows containing non-finite values (stat_summary).

Flares (> 1) vs Imputed PRS

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         NUMATK >= 2) %>% 
  ggplot(aes(x = FLARE_CAT, y = PRS1, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Gout PRS - Imputed") +
    theme(legend.title = element_blank())
## Warning: Removed 815 rows containing non-finite values (stat_summary).

## Warning: Removed 815 rows containing non-finite values (stat_summary).

## Warning: Removed 815 rows containing non-finite values (stat_summary).

Flares (all) vs Genotyped PRS

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = FLARE_CAT, y = PRS2, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Gout PRS - Directly Genotyped") +
    theme(legend.title = element_blank())

Flares (> 1) vs Genotyped PRS

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         NUMATK >= 2) %>% 
  ggplot(aes(x = FLARE_CAT, y = PRS2, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Gout PRS - Directly Genotyped") +
    theme(legend.title = element_blank())

Tophi vs Imputed PRS

# Tophi
all_cohorts %>% 
  filter(GOUT,
         !str_detect(COHORT2, "CRYSTAL")) %>% 
  mutate(TOPHIGOUT = factor(case_when(TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi",
                                      is.na(TOPHIGOUT) ~ "No Data"), 
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  ggplot(mapping = aes(x = TOPHIGOUT, y = PRS1, fill = SEX)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    labs(y = "Gout PRS - Imputed") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())
## Warning: Removed 1243 rows containing non-finite values (stat_summary).

## Warning: Removed 1243 rows containing non-finite values (stat_summary).

Tophi vs Genotyped PRS

all_cohorts %>% 
  filter(GOUT,
         !str_detect(COHORT2, "CRYSTAL")) %>% 
  mutate(TOPHIGOUT = factor(case_when(TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi",
                                      is.na(TOPHIGOUT) ~ "No Data"), 
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  ggplot(mapping = aes(x = TOPHIGOUT, y = PRS2, fill = GROUP)) +
    stat_summary(fun = mean, geom = "bar", position = position_dodge()) +
    stat_summary(fun.data = mean_se, geom = "errorbar", position = position_dodge(width = 0.9), width = 0.3, alpha = 0.75) +
    facet_wrap(~ SEX) +
    labs(y = "Gout PRS - Directly Genotyped") +
    theme(axis.title.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          legend.title = element_blank())


Plotting relationship between both PRSs and age/sex

Age vs Imputed PRS

# Plotting relationship between each PRS and covariates (age + sex)
all_cohorts %>% 
  ggplot(mapping = aes(x = AGECOL, y = PRS1, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Mean Age at Collection (years)", 
         y = "Gout PRS - Imputed") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2439 rows containing non-finite values (stat_smooth).

Age vs Genotyped PRS

all_cohorts %>% 
  ggplot(mapping = aes(x = AGECOL, y = PRS2, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Mean Age at Collection (years)", 
         y = "Gout PRS - Directly Genotyped") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'


Plotting relationship between severity traits

Onset vs Tophi

# Plotting relationships between severity traits
all_cohorts %>% 
  filter(!is.na(AGE1ATK),
         !str_detect(COHORT2, "CRYSTAL")) %>% 
  mutate(TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  ggplot(aes(y = AGE1ATK, x = TOPHIGOUT, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Age at Onset (years)") +
  theme(axis.title.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.title = element_blank())

Duration vs Tophi

all_cohorts %>% 
  filter(!is.na(DURATION),
         !str_detect(COHORT2, "CRYSTAL")) %>% 
  mutate(TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  ggplot(aes(y = DURATION, x = TOPHIGOUT, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Disease Duration (years)") +
  theme(axis.title.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.title = element_blank())

Flares (all) vs Tophi

all_cohorts %>% 
  filter(!is.na(NUMATK),
         !str_detect(COHORT2, "CRYSTAL"),
         !str_detect(COHORT2, "Ardea")) %>% 
  mutate(TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data")),
         NUMATK = case_when(NUMATK > 52 ~ 52,
                            TRUE ~ NUMATK)) %>% 
  ggplot(aes(y = NUMATK, x = TOPHIGOUT, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Number of Flares in Last Year") +
  theme(axis.title.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.title = element_blank())

Flares (> 1) vs Tophi

all_cohorts %>% 
  filter(!is.na(NUMATK),
         !str_detect(COHORT2, "CRYSTAL"),
         NUMATK >= 2) %>% 
  mutate(TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data")),
         NUMATK = case_when(NUMATK > 52 ~ 52,
                            TRUE ~ NUMATK)) %>% 
  ggplot(aes(y = NUMATK, x = TOPHIGOUT, color = GROUP)) +
  geom_boxplot() +
  facet_wrap(~ SEX) +
  labs(y = "Number of Flares in Last Year") +
  theme(axis.title.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.title = element_blank())

Flares (all - cat) vs Tophi

all_cohorts %>% 
  filter(GOUT,
         !str_detect(COHORT2, "CRYSTAL"),
         !str_detect(COHORT2, "Ardea")) %>% 
  mutate(FLARE_CAT = factor(case_when(is.na(FLARE_CAT) ~ "No Data",
                                      TRUE ~ as.character(FLARE_CAT)), 
                            levels = rev(c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52",
                                       "No Data")),
                            ordered = TRUE),
         TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  group_by(GROUP, TOPHIGOUT, FLARE_CAT, SEX) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(fill = FLARE_CAT, y = value, x = TOPHIGOUT, color = TOPHIGOUT)) +
  geom_bar(position = "fill", stat = "identity") +
  facet_wrap(~ SEX * GROUP) +
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  theme(axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank())
## `summarise()` has grouped output by 'GROUP', 'TOPHIGOUT', 'FLARE_CAT'. You can override using the `.groups` argument.

Flares (> 1 - cat) vs Tophi

all_cohorts %>% 
  filter(GOUT,
         !str_detect(COHORT2, "CRYSTAL"),
         NUMATK >= 2) %>% 
  mutate(FLARE_CAT = factor(case_when(is.na(FLARE_CAT) ~ "No Data",
                                      TRUE ~ as.character(FLARE_CAT)), 
                            levels = rev(c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52",
                                       "No Data")),
                            ordered = TRUE),
         TOPHIGOUT = factor(case_when(is.na(TOPHIGOUT) ~ "No Data",
                                      TOPHIGOUT ~ "Tophi",
                                      !TOPHIGOUT ~ "No Tophi"),
                            levels = c("Tophi", "No Tophi", "No Data"))) %>% 
  group_by(GROUP, TOPHIGOUT, FLARE_CAT, SEX) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(fill = FLARE_CAT, y = value, x = TOPHIGOUT, color = TOPHIGOUT)) +
  geom_bar(position = "fill", stat = "identity") +
  facet_wrap(~ SEX * GROUP) +
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  theme(axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank())
## `summarise()` has grouped output by 'GROUP', 'TOPHIGOUT', 'FLARE_CAT'. You can override using the `.groups` argument.

Onset vs Flares (all)

all_cohorts %>% 
  filter(!is.na(AGE1ATK),
         !is.na(NUMATK),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = AGE1ATK, y = NUMATK, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Age at Onset (years)", 
         y = "Number of Flares in Last Year") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Onset vs Flares (> 1)

all_cohorts %>% 
  filter(!is.na(AGE1ATK),
         !is.na(NUMATK),
         NUMATK >= 2) %>% 
  ggplot(aes(x = AGE1ATK, y = NUMATK, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Age at Onset (years)", 
         y = "Number of Flares in Last Year") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Duration vs Flares (all)

all_cohorts %>% 
  filter(!is.na(DURATION),
         !is.na(NUMATK),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = DURATION, y = NUMATK, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Disease Duration (years)", 
         y = "Number of Flares in Last Year") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Duration vs Flares (> 1)

all_cohorts %>% 
  filter(!is.na(DURATION),
         !is.na(NUMATK),
         NUMATK >= 2) %>% 
  ggplot(aes(x = DURATION, y = NUMATK, color = GROUP)) +
    geom_smooth(method = "lm", se = F) +
    facet_wrap(~ SEX) +
    labs(x = "Disease Duration (years)", 
         y = "Number of Flares in Last Year") +
    theme(legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'

Onset vs Flares (all - cat)

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = FLARE_CAT, y = AGE1ATK, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Age at Onset (years)") +
    theme(legend.title = element_blank())
## Warning: Removed 55 rows containing non-finite values (stat_summary).

## Warning: Removed 55 rows containing non-finite values (stat_summary).

## Warning: Removed 55 rows containing non-finite values (stat_summary).

Onset vs Flares (> 1 - cat)

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         NUMATK >= 2) %>% 
  ggplot(aes(x = FLARE_CAT, y = AGE1ATK, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Age at Onset (years)") +
    theme(legend.title = element_blank())
## Warning: Removed 22 rows containing non-finite values (stat_summary).

## Warning: Removed 22 rows containing non-finite values (stat_summary).

## Warning: Removed 22 rows containing non-finite values (stat_summary).

Duration vs Flares (all - cat)

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         !str_detect(COHORT2, "Ardea")) %>% 
  ggplot(aes(x = FLARE_CAT, y = DURATION, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Disease Duration (years)") +
    theme(legend.title = element_blank())
## Warning: Removed 55 rows containing non-finite values (stat_summary).

## Warning: Removed 55 rows containing non-finite values (stat_summary).

## Warning: Removed 55 rows containing non-finite values (stat_summary).

Duration vs Flares (> 1 - cat)

all_cohorts %>% 
  filter(GOUT, !is.na(FLARE_CAT),
         NUMATK >= 2) %>% 
  ggplot(aes(x = FLARE_CAT, y = DURATION, color = COHORT2)) +
    stat_summary(geom = "point", fun = mean) +
    stat_summary(geom = "errorbar", fun.data = mean_se, width = 0.3) +
    stat_summary(geom = "line", fun = mean, aes(group = COHORT2)) +
    facet_wrap(~ SEX) +
    labs(x = "Number of Flares in Last Year (categorical)", y = "Disease Duration (years)") +
    theme(legend.title = element_blank())
## Warning: Removed 22 rows containing non-finite values (stat_summary).

## Warning: Removed 22 rows containing non-finite values (stat_summary).

## Warning: Removed 22 rows containing non-finite values (stat_summary).


Table of MAF for each SNP in each cohort

# Making a table with SNPs as rows and columns representing MAF for each cohort for that SNP
SNPlist <- UKBB_Gene_OR$RSID
SNPlist2 <- Poly_Gene_OR$RSID

get_maf <- function(cohort, snps){
  test <- cohort %>% 
    select(all_of(snps)) %>% 
    mutate_all(factor, levels = 0:2) %>% 
    na.omit()
  
  tmp <- c()
  for(i in 1:ncol(test)){
    tmp[i] <- ((sum(test[[i]] == 1) + (sum(test[[i]] == 2) * 2)) / (2 * nrow(test))) %>% sprintf(fmt = "%#.3f")
  }
  return(tmp)
}

freq_table <- tibble("Cohort" = rep(cohortstring, each = length(SNPlist)), 
                     "MAF" = unlist(lapply(data_list2, function(x) get_maf(cohort = x, snps = SNPlist))),
                     "SNP" = rep(SNPlist, length(cohortstring))) %>%
  pivot_wider(names_from = SNP, values_from = MAF)

freq_table <- transpose_df(freq_table) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(freq_table), ~ str_replace(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(freq_table) <- str_replace(row.names(freq_table), " ", "&nbsp;")

freq_table %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
rs114165349 0.030 0.031 0.022 0.023 0.027 0.026 0.022 0.021 0.036 0.032 0.011 0.020 0.038 0.038 0.022 0.031 0.032 0.062 0.047 0.000 0.020 0.000 NaN NaN NaN NaN NaN NaN NaN NaN
rs1967017 0.496 0.469 0.462 0.461 0.494 0.449 0.470 0.472 0.497 0.504 0.578 0.474 0.537 0.515 0.500 0.344 0.521 0.375 0.503 0.375 0.564 0.611 NaN NaN NaN NaN NaN NaN NaN NaN
rs11264341 0.597 0.610 0.571 0.569 0.598 0.549 0.558 0.540 0.606 0.564 0.589 0.599 0.590 0.638 0.587 0.656 0.607 0.688 0.656 0.625 0.627 0.611 NaN NaN NaN NaN NaN NaN NaN NaN
rs1260326 0.437 0.432 0.390 0.392 0.453 0.479 0.401 0.401 0.469 0.392 0.522 0.500 0.461 0.492 0.482 0.562 0.509 0.750 0.471 0.375 0.495 0.444 NaN NaN NaN NaN NaN NaN NaN NaN
rs2581790 0.351 0.344 0.324 0.324 0.333 0.321 0.314 0.299 0.333 0.352 0.311 0.283 0.343 0.323 0.372 0.375 0.307 0.375 0.312 0.250 0.343 0.389 NaN NaN NaN NaN NaN NaN NaN NaN
rs10805346 0.656 0.653 0.567 0.570 0.647 0.674 0.573 0.568 0.647 0.624 0.633 0.592 0.678 0.654 0.686 0.719 0.659 0.688 0.721 0.875 0.637 0.778 NaN NaN NaN NaN NaN NaN NaN NaN
rs73225891 0.980 0.979 0.975 0.975 0.986 0.985 0.970 0.981 0.980 0.988 0.967 0.987 0.981 0.985 0.975 1.000 0.983 0.938 0.982 1.000 0.985 0.944 NaN NaN NaN NaN NaN NaN NaN NaN
rs938558 0.807 0.828 0.723 0.725 0.798 0.836 0.731 0.718 0.801 0.800 0.744 0.743 0.816 0.815 0.839 0.781 0.815 0.875 0.841 1.000 0.784 0.944 NaN NaN NaN NaN NaN NaN NaN NaN
rs11723439 0.871 0.893 0.801 0.803 0.876 0.890 0.806 0.797 0.863 0.876 0.856 0.855 0.876 0.915 0.901 0.875 0.888 0.875 0.903 1.000 0.873 0.944 NaN NaN NaN NaN NaN NaN NaN NaN
rs10939671 0.403 0.344 0.420 0.419 0.392 0.346 0.424 0.399 0.407 0.372 0.378 0.336 0.392 0.338 0.433 0.312 0.429 0.500 0.382 0.625 0.373 0.444 NaN NaN NaN NaN NaN NaN NaN NaN
rs2231142 0.198 0.156 0.110 0.113 0.214 0.205 0.110 0.116 0.196 0.180 0.122 0.092 0.222 0.169 0.247 0.281 0.219 0.312 0.274 0.125 0.196 0.056 NaN NaN NaN NaN NaN NaN NaN NaN
rs28366540 0.532 0.521 0.458 0.460 0.539 0.533 0.457 0.468 0.531 0.516 0.533 0.447 0.549 0.615 0.538 0.625 0.588 0.750 0.562 0.500 0.593 0.444 NaN NaN NaN NaN NaN NaN NaN NaN
rs1229984 0.033 0.026 0.022 0.022 0.033 0.031 0.023 0.024 0.052 0.048 0.056 0.086 0.047 0.031 0.047 0.031 0.075 0.000 0.041 0.000 0.049 0.056 NaN NaN NaN NaN NaN NaN NaN NaN
rs13160226 0.789 0.788 0.769 0.769 0.784 0.772 0.775 0.761 0.772 0.724 0.800 0.816 0.791 0.754 0.765 0.875 0.768 0.938 0.794 0.875 0.804 0.722 NaN NaN NaN NaN NaN NaN NaN NaN
rs13191182 0.352 0.355 0.331 0.332 0.343 0.321 0.335 0.347 0.364 0.328 0.444 0.388 0.351 0.338 0.392 0.375 0.343 0.250 0.350 0.375 0.431 0.333 NaN NaN NaN NaN NaN NaN NaN NaN
rs1165154 0.609 0.610 0.566 0.569 0.598 0.631 0.578 0.574 0.625 0.588 0.578 0.487 0.646 0.608 0.601 0.656 0.599 0.688 0.591 0.750 0.623 0.722 NaN NaN NaN NaN NaN NaN NaN NaN
rs13240065 0.892 0.895 0.871 0.871 0.898 0.874 0.875 0.866 0.890 0.880 0.922 0.882 0.887 0.954 0.913 0.875 0.895 0.875 0.909 1.000 0.917 1.000 NaN NaN NaN NaN NaN NaN NaN NaN
rs1171615 0.799 0.800 0.769 0.769 0.789 0.782 0.770 0.781 0.792 0.788 0.756 0.796 0.804 0.792 0.803 0.844 0.833 0.750 0.794 0.625 0.858 0.833 NaN NaN NaN NaN NaN NaN NaN NaN
rs7116077 0.774 0.773 0.760 0.761 0.750 0.731 0.782 0.770 0.778 0.780 0.800 0.783 0.778 0.777 0.776 0.750 0.753 0.812 0.724 0.750 0.770 0.722 NaN NaN NaN NaN NaN NaN NaN NaN
rs7943154 0.479 0.491 0.432 0.435 0.466 0.395 0.480 0.432 0.505 0.524 0.500 0.579 0.493 0.554 0.500 0.500 0.528 0.500 0.521 0.500 0.500 0.611 NaN NaN NaN NaN NaN NaN NaN NaN
rs11605121 0.210 0.193 0.184 0.186 0.198 0.200 0.176 0.201 0.235 0.212 0.222 0.158 0.224 0.192 0.215 0.375 0.285 0.125 0.250 0.000 0.260 0.333 NaN NaN NaN NaN NaN NaN NaN NaN
rs948493 0.368 0.360 0.341 0.342 0.384 0.354 0.332 0.348 0.382 0.360 0.411 0.401 0.376 0.354 0.363 0.375 0.410 0.562 0.359 0.250 0.373 0.500 NaN NaN NaN NaN NaN NaN NaN NaN
rs3741414 0.786 0.777 0.756 0.756 0.799 0.790 0.758 0.754 0.781 0.728 0.822 0.809 0.809 0.823 0.848 0.844 0.815 0.875 0.812 0.875 0.814 0.833 NaN NaN NaN NaN NaN NaN NaN NaN
rs7484733 0.550 0.529 0.521 0.523 0.516 0.559 0.530 0.532 0.530 0.508 0.533 0.480 0.535 0.523 0.502 0.500 0.539 0.188 0.524 0.625 0.578 0.611 NaN NaN NaN NaN NaN NaN NaN NaN
rs138993217 0.184 0.181 0.167 0.167 0.182 0.154 0.170 0.180 0.155 0.172 0.100 0.197 0.156 0.169 0.166 0.156 0.167 0.312 0.176 0.125 0.157 0.167 NaN NaN NaN NaN NaN NaN NaN NaN
rs12973279 0.271 0.258 0.254 0.252 0.281 0.267 0.245 0.271 0.262 0.292 0.267 0.263 0.293 0.254 0.271 0.250 0.266 0.375 0.271 0.500 0.314 0.222 NaN NaN NaN NaN NaN NaN NaN NaN
rs738408 0.799 0.808 0.784 0.784 0.802 0.803 0.782 0.801 0.781 0.812 0.767 0.763 0.780 0.792 0.753 0.750 0.813 0.875 0.797 0.750 0.755 0.889 NaN NaN NaN NaN NaN NaN NaN NaN
# Now for directly genotyped PRS
freq_table <- tibble("Cohort" = rep(cohortstring, each = length(SNPlist2)), 
                     "MAF" = unlist(lapply(data_list, function(x) get_maf(cohort = x, snps = SNPlist2))),
                     "SNP" = rep(SNPlist2, length(cohortstring))) %>% 
  pivot_wider(names_from = SNP, values_from = MAF)

freq_table <- transpose_df(freq_table) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(freq_table), ~ str_replace(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(freq_table) <- str_replace(row.names(freq_table), " ", "&nbsp;")

freq_table %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
rs10910845 0.495 0.474 0.462 0.461 0.490 0.450 0.471 0.472 0.508 0.510 0.578 0.474 0.536 0.515 0.498 0.344 0.521 0.375 0.503 0.375 0.564 0.611 0.681 0.725 0.678 0.636 0.704 0.662 0.647 0.617
rs11264341 0.597 0.599 0.571 0.569 0.594 0.552 0.559 0.540 0.606 0.599 0.589 0.599 0.589 0.638 0.587 0.656 0.607 0.688 0.656 0.625 0.627 0.611 0.447 0.482 0.435 0.459 0.390 0.522 0.404 0.343
rs1260326 0.437 0.429 0.391 0.392 0.449 0.469 0.401 0.401 0.471 0.423 0.522 0.500 0.462 0.492 0.482 0.562 0.509 0.750 0.471 0.375 0.495 0.444 0.329 0.345 0.312 0.291 0.363 0.331 0.234 0.275
rs9847710 0.453 0.444 0.424 0.424 0.450 0.427 0.422 0.413 0.447 0.452 0.378 0.368 0.458 0.446 0.491 0.438 0.408 0.500 0.415 0.375 0.446 0.611 0.466 0.494 0.426 0.471 0.660 0.618 0.651 0.646
rs7675964 0.807 0.825 0.724 0.727 0.795 0.820 0.731 0.718 0.798 0.811 0.733 0.743 0.817 0.815 0.839 0.781 0.815 0.875 0.841 1.000 0.784 0.944 0.778 0.801 0.766 0.752 0.572 0.654 0.534 0.533
rs4481233 0.877 0.894 0.809 0.811 0.874 0.884 0.815 0.800 0.868 0.883 0.856 0.855 0.883 0.915 0.906 0.875 0.893 0.938 0.903 1.000 0.873 0.944 0.969 0.974 0.958 0.953 0.975 0.978 0.972 0.955
rs2276961 0.430 0.379 0.455 0.453 0.424 0.382 0.453 0.444 0.439 0.398 0.389 0.382 0.423 0.377 0.460 0.344 0.470 0.562 0.394 0.625 0.422 0.444 0.279 0.231 0.281 0.285 0.475 0.419 0.500 0.479
rs2231142 0.199 0.156 0.110 0.113 0.218 0.199 0.110 0.116 0.196 0.189 0.122 0.092 0.221 0.169 0.247 0.281 0.219 0.312 0.274 0.125 0.196 0.056 0.106 0.076 0.062 0.066 0.468 0.397 0.232 0.214
rs10011796 0.532 0.515 0.458 0.460 0.537 0.524 0.456 0.468 0.526 0.520 0.533 0.447 0.549 0.615 0.538 0.625 0.588 0.750 0.562 0.500 0.588 0.444 0.582 0.605 0.562 0.558 0.681 0.640 0.622 0.601
rs2762353 0.608 0.604 0.565 0.569 0.600 0.630 0.578 0.574 0.611 0.597 0.578 0.487 0.644 0.608 0.601 0.656 0.599 0.688 0.591 0.750 0.623 0.722 0.721 0.719 0.696 0.705 0.758 0.757 0.718 0.714
rs35332062 0.893 0.891 0.871 0.872 0.899 0.872 0.877 0.867 0.890 0.885 0.922 0.882 0.887 0.946 0.915 0.875 0.895 0.875 0.909 1.000 0.917 1.000 0.960 0.965 0.956 0.946 0.969 0.971 0.970 0.967
rs1171616 0.798 0.800 0.768 0.768 0.790 0.782 0.771 0.781 0.794 0.783 0.756 0.796 0.803 0.792 0.803 0.844 0.830 0.750 0.794 0.625 0.858 0.833 0.947 0.950 0.938 0.940 0.987 0.956 0.979 0.979
rs17300741 0.495 0.490 0.449 0.451 0.484 0.408 0.489 0.449 0.531 0.533 0.478 0.572 0.497 0.577 0.511 0.469 0.532 0.500 0.524 0.500 0.515 0.667 0.793 0.813 0.799 0.774 0.822 0.816 0.865 0.864
rs7937990 0.210 0.188 0.185 0.187 0.197 0.192 0.176 0.201 0.231 0.209 0.222 0.158 0.223 0.192 0.215 0.375 0.285 0.125 0.250 0.000 0.260 0.333 0.445 0.459 0.459 0.405 0.456 0.434 0.369 0.385
rs11227281 0.369 0.362 0.342 0.343 0.381 0.344 0.334 0.352 0.375 0.365 0.411 0.395 0.376 0.354 0.368 0.406 0.410 0.562 0.362 0.250 0.373 0.500 0.401 0.365 0.397 0.401 0.259 0.250 0.216 0.246
rs1106766 0.785 0.777 0.757 0.756 0.798 0.784 0.759 0.756 0.793 0.753 0.833 0.809 0.808 0.823 0.848 0.844 0.815 0.875 0.809 0.875 0.814 0.833 0.927 0.956 0.933 0.937 0.978 0.971 0.966 0.974
rs28652632 0.546 0.532 0.521 0.523 0.517 0.552 0.529 0.532 0.532 0.497 0.533 0.480 0.534 0.523 0.500 0.500 0.539 0.188 0.521 0.625 0.578 0.556 0.702 0.690 0.652 0.672 0.597 0.574 0.557 0.573
rs738409 0.799 0.803 0.784 0.784 0.804 0.806 0.783 0.801 0.775 0.788 0.767 0.763 0.780 0.792 0.753 0.750 0.813 0.875 0.797 0.750 0.755 0.889 0.765 0.807 0.753 0.800 0.728 0.757 0.757 0.779
# rs35332062 is monomorphic in females in western polynesians so it should be removed from the individual SNP analysis, though it can be left in the PRS


Running all models of interest

The purpose of this document is to run all models of interest for assessing the relationship between gout genetic risk and severity of gout. This includes modeling gout vs both PRS’s, and modeling age at onset, tophi, and flare frequency vs both PRS’s. For now, I have decided to not run flare frequency models and to just display the plots while discussing how complex the flare phenotype is.

# Datasets
load(here("Output/all_pheno_prs.RData"))

# Making FLARE_CAT variable and setting all control gout severity traits to NA and removing any non-Europeans with an imputed PRS
all_pheno_prs <- all_pheno_prs %>%
  mutate(FLARE_CAT = factor(case_when(between(NUMATK, 0, 5) ~ paste0(as.character(NUMATK), " flares in last year"),
                                      between(NUMATK, 6, 11) ~ "One every one to two months", 
                                      between(NUMATK, 12, 52) ~ "One or more per month"),
                            levels = c(paste0(0:5, " flares in last year"),
                                       "One every one to two months",
                                       "One or more per month"),
                            labels = c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52"),
                            ordered = TRUE),
         AGE1ATK = case_when(GOUT ~ AGE1ATK),
         DURATION = case_when(GOUT ~ DURATION),
         NUMATK = case_when(GOUT ~ NUMATK),
         TOPHIGOUT = case_when(GOUT ~ TOPHIGOUT),
         ULT = case_when(GOUT ~ ULT),
         SEX = factor(SEX, levels = c("Male", "Female")),
         GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
         GROUP = factor(case_when(GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Control",
                                  GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Control",
                                  GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Control"), 
                        levels = c("European Gout", "European Control", "East Polynesian Gout", "East Polynesian Control", "West Polynesian Gout", "West Polynesian Control")),
         GROUP2 = factor(case_when(GROUP == "European Gout" & SEX == "Male" ~ "European Gout - male",
                                  GROUP == "European Gout" & SEX == "Female" ~ "European Gout - female",
                                  GROUP == "European Control" & SEX == "Male" ~ "European Control - male",
                                  GROUP == "European Control" & SEX == "Female" ~ "European Control - female",
                                  GROUP == "East Polynesian Gout" & SEX == "Male" ~ "East Polynesian Gout - male",
                                  GROUP == "East Polynesian Gout" & SEX == "Female" ~ "East Polynesian Gout - female",
                                  GROUP == "East Polynesian Control" & SEX == "Male" ~ "East Polynesian Control - male",
                                  GROUP == "East Polynesian Control" & SEX == "Female" ~ "East Polynesian Control - female",
                                  GROUP == "West Polynesian Gout" & SEX == "Male" ~ "West Polynesian Gout - male",
                                  GROUP == "West Polynesian Gout" & SEX == "Female" ~ "West Polynesian Gout - female",
                                  GROUP == "West Polynesian Control" & SEX == "Male" ~ "West Polynesian Control - male",
                                  GROUP == "West Polynesian Control" & SEX == "Female" ~ "West Polynesian Control - female"), 
                        levels = c("European Gout - male", "European Gout - female", "European Control - male", "European Control - female", "East Polynesian Gout - male", "East Polynesian Gout - female", "East Polynesian Control - male", "East Polynesian Control - female", "West Polynesian Gout - male", "West Polynesian Gout - female", "West Polynesian Control - male", "West Polynesian Control - female")),
         GROUP3 = factor(case_when(GOUT & SEX == "Male" ~ "Male Gout",
                                   GOUT & SEX == "Female" ~ "Female Gout",
                                   !GOUT & SEX == "Male" ~ "Male Control",
                                   !GOUT & SEX == "Female" ~ "Female Control"), 
                         levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")),
         COHORT2 = factor(case_when(GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
                                    !GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Control",
                                    GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Gout",
                                    !GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Control",
                                    Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LASSO",
                                    Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR1",
                                    Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR2",
                                    Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CRYSTAL",
                                    Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LIGHT",
                                    GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Control"), 
                          levels = c("UK Biobank - Gout", "UK Biobank - Control", "Aus/NZ - Gout", "Aus/NZ - Control", "GlobalGout - Gout", "GlobalGout - Control", "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT", "East Polynesian - Gout", "East Polynesian - Control", "West Polynesian - Gout", "West Polynesian - Control")),
         NUMATK = round(NUMATK)) %>% 
  filter(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") | !(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")) & is.na(PRS1),
         !is.na(AGECOL),
         !is.na(PRS2),
         !(is.na(PRS1) & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
         (Pheno.Study == "UK Biobank" | !GOUT | GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))),
         !is.na(COHORT2))

all_pheno_prs_male <- all_pheno_prs %>% 
  filter(SEX == "Male")

all_pheno_prs_female <- all_pheno_prs %>% 
  filter(SEX == "Female")

data_list <- list("UK Biobank - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                             Pheno.Study == "UK Biobank"),
                  "UK Biobank - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                 Pheno.Study == "UK Biobank"),
                  "UK Biobank - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                Pheno.Study == "UK Biobank"),
                  "UK Biobank - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                    Pheno.Study == "UK Biobank"),
                  "Aus/NZ European - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                      Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                     Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                         Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "GlobalGout - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                             Pheno.Study == "EuroGout",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                 Pheno.Study == "EuroGout",
                                                                                 Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                Pheno.Study == "EuroGout",
                                                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                    Pheno.Study == "EuroGout",
                                                                                    Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LASSO - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: 401",
                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LASSO - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: 401",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR1 - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                                          Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR1 - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                                              Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR2 - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                                          Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR2 - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                                              Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CRYSTAL - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                                           Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CRYSTAL - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                                               Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LIGHT - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LIGHT - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "East Polynesian - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("East Polynesian")),
                  "West Polynesian - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")))

load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Poly_Gene_OR.RData"))
OR <- function(x, Predictor) {
  sprintf(exp(coef(x))[[Predictor]], fmt = "%#.4f")
}

LCL_OR <- function(x, Predictor) {
  sprintf(exp(confint.default(x))[Predictor, 1], fmt = "%#.4f")
}

UCL_OR <- function(x, Predictor) {
  sprintf(exp(confint.default(x))[Predictor, 2], fmt = "%#.4f")
}

Pval <- function(x, Predictor) {
  signif(summary(x)$coefficients[Predictor, 4], 3)
}

Beta <- function(x, Predictor) {
  sprintf(coef(x)[[Predictor]], fmt = "%#.4f")
}

LCL <- function(x, Predictor) {
  sprintf(confint.default(x)[Predictor, 1], fmt = "%#.4f")
}

UCL <- function(x, Predictor) {
  sprintf(confint.default(x)[Predictor, 2], fmt = "%#.4f")
}

First, I want to test the assumptions of the models of interest. For linear regression, the assumptions are:

  1. Linear relationship between variables
  2. Normality of residuals
  3. Homoscedasticity of residuals
  4. No multicollinearity (for multiple regression)

Logistic regression only relies on no multicollinearity.

All assumptions should be tested for all models, but I will just test some representative models instead - the main thing is that extremely non-normal variables should not be used for linear regression. This should be tested for the age at onset variable, and the number of flares per year variable.

# Age at onset -------------------------------------------------------------------------------------
# Representative linear model of age at onset vs the imputed PRS in the combined male gout cohort
tmp <- all_pheno_prs %>% 
  filter(SEX == "Male",
         GOUT,
         Geno.SpecificAncestry %in% c("European", "Iberian", "European; Iberian"))

ggplot(tmp, aes(x = PRS1, y = AGE1ATK, color = COHORT2)) +
  geom_point(shape = 1) +
  geom_smooth(se = FALSE) # No evidence of non-linearity

mod <- lm(AGE1ATK ~ PRS1 + Geno.PCVector1 + Geno.PCVector2 + Geno.PCVector3 + Geno.PCVector4 + Geno.PCVector5 + Geno.PCVector6 + Geno.PCVector7 + Geno.PCVector8 + Geno.PCVector9 + Geno.PCVector10, data = tmp)

test <- augment(mod)

ggplot(test, aes(x = .fitted, y = .resid)) +
  geom_point() # no obvious pattern, so homoscedasticity seems to be met

ggplot(data = test, mapping = aes(x = .resid)) + 
  geom_histogram(mapping = aes(y = ..density..), bins = 30, fill = 'gray', color = 'black') +
  stat_function(fun = 'dnorm', 
                args = list(mean = mean(test$.resid), sd = sd(test$.resid)), 
                color = 'red')

ggplot(data = test, mapping = aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line()

ggplot(data = test, mapping = aes(x = .resid)) +
  geom_boxplot() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

test %>% 
  pull(.resid) %>% 
  stat.desc(basic = FALSE, desc = FALSE, norm = TRUE) %>% 
  enframe() %>% 
  pivot_wider(names_from = name, values_from = value)

# Normality of residuals is completely met here (ignore 2SE and shapiro-wilk test as this is large data)

vif(mod) # no multicollinearity (no values over 5 or 10)

# So we should have no problem running models of the PRS vs age at onset (as long as we don't include variables that exhibit multicollinearity)



# Flares (linear) ----------------------------------------------------------------------------------------
# First let's run a representative linear model of NUMATK vs PRS in the combined male gout cohort
tmp <- tmp %>% 
  mutate(NUMATK = case_when(NUMATK > 52 ~ 52,
                            TRUE ~ NUMATK)) %>% 
  filter(NUMATK >= 2)

ggplot(tmp, aes(x = PRS1, y = NUMATK, color = COHORT2)) +
  geom_point(shape = 1) +
  geom_smooth(se = FALSE)

mod <- lm(NUMATK ~ PRS1 + Geno.PCVector1 + Geno.PCVector2 + Geno.PCVector3 + Geno.PCVector4 + Geno.PCVector5 + Geno.PCVector6 + Geno.PCVector7 + Geno.PCVector8 + Geno.PCVector9 + Geno.PCVector10, data = tmp)

test <- augment(mod)

ggplot(test, aes(x = .fitted, y = .resid)) +
  geom_point() # weird pattern, so homoscedasticity seems to not be met

ggplot(data = test, mapping = aes(x = .resid)) + 
  geom_histogram(mapping = aes(y = ..density..), bins = 30, fill = 'gray', color = 'black') +
  stat_function(fun = 'dnorm', 
                args = list(mean = mean(test$.resid), sd = sd(test$.resid)), 
                color = 'red')

ggplot(data = test, mapping = aes(sample = .resid)) +
  geom_qq() +
  geom_qq_line()

ggplot(data = test, mapping = aes(x = .resid)) +
  geom_boxplot() +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

test %>% 
  pull(.resid) %>% 
  stat.desc(basic = FALSE, desc = FALSE, norm = TRUE) %>% 
  enframe() %>% 
  pivot_wider(names_from = name, values_from = value)

# Normality of residuals is not met here (ignore 2SE and shapiro-wilk test as this is large data)

vif(mod) # no multicollinearity (no values over 5 or 10)

# So it is not appropriate to use the NUMATK variable in a linear regression model



# Flares (ordinal) -----------------------------------------------------------------

# I will go ahead and test whether categorizing the variable (FLARE_CAT) and running an independent ANOVA or ordinal logistic regression model will be appropriate - note I am not sure how easy it will be to adjust for covariates in an ANOVA (perhaps I could run an ANCOVA but it might not be doing what I think it's doing)

# The ordinal logistic regression test should tell us the odds of being in any combination of higher flare categories vs the remainder of the categories (i.e. highest flare group vs all others, or highest 5 flare groups vs lowest flare group)

# I'll try running an example from a paper to see if I can replicate their results in R

# dat <- tibble("Group" = rep(c("Activator", "Headgear"), each = 50),
#               "Happiness" = factor(c(rep("Unhappy", 30), rep("Somewhat happy", 14), rep("Very happy", 6), rep("Unhappy", 14), rep("Somewhat happy", 25), rep("Very happy", 11)), levels = c("Unhappy", "Somewhat happy", "Very happy"), ordered = T))
# 
# mod <- MASS::polr(Happiness ~ Group, data = dat, Hess = TRUE)
# 
# modsum <- tibble("Group of interest" = row.names(summary(mod)$coefficients)[1],
#                  "OR" = exp(summary(mod)$coefficients)[1],
#                  "Lower CI" = exp(confint.default(mod))[1],
#                  "Upper CI" = exp(confint.default(mod))[2],
#                  "P-value" = pnorm(abs(summary(mod)$coefficients[1, "t value"]), lower.tail = FALSE) * 2)

# To interpret the odds ratio, we can say that for the group of children that wore Headgear, the odds of being more happy (i.e. being in a happiness tier or higher compared to a lower tier) were 3.27 times those of the the children that used an Activator 

# The parallel to this in our example would be that for those individuals with 1 unit more PRS than another group, the odds of being in a higher flare category (i.e. >= 3 flares vs < 3 flares) were X times those of the other group (i.e. those with 1 unit PRS lower - the reference group). Let's test this using an example model

mod <- polr(FLARE_CAT ~ PRS2, data = tmp, Hess = TRUE)

modsum <- tibble("Group of interest" = row.names(summary(mod)$coefficients)[1],
                 "OR" = exp(summary(mod)$coefficients)[1],
                 "Lower CI" = exp(confint.default(mod))[1],
                 "Upper CI" = exp(confint.default(mod))[2],
                 "P-value" = pnorm(abs(summary(mod)$coefficients[1, "t value"]), lower.tail = FALSE) * 2)

# This suggests that the model is significant for the full male gout cohort, with an OR of 1.11 [1.00, 1.24], p = 0.047 per PRS unit

# Can try to visualize this by looking at proportions of FLARE_CAT within bins of PRS

tmp %>% 
  filter(!is.na(PRS2)) %>% 
  mutate(PRS_bin = factor(ntile(PRS2, 6), ordered = T),
         FLARE_CAT = factor(case_when(is.na(FLARE_CAT) ~ "No Data",
                                      TRUE ~ as.character(FLARE_CAT)), 
                            levels = rev(c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52",
                                       "No Data")),
                            ordered = TRUE)) %>%
  group_by(PRS_bin, FLARE_CAT, SEX, GROUP) %>% 
  summarize(value = n()) %>% 
  ggplot(aes(fill = FLARE_CAT, y = value, x = PRS_bin)) +
  geom_bar(position = "fill", stat = "identity") +
  facet_wrap(~ SEX * GROUP) +
  scale_fill_discrete(type = c("#C0C0C0", "#FDE725FF", "#9FDA3AFF", "#4AC16DFF", "#1FA187FF", "#277F8EFF", "#365C8DFF", "#46337EFF", "#440154FF")) +
  theme(axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        legend.title = element_blank())



# Flares (ANOVA) --------------------------------------------------------------------------------------

# The ANOVA test will essentially ask "does the mean PRS differ between flare categories"
# I should further note that I am unclear on how to meta-analyze ANOVA results - perhaps just running them within each cohort and determining how many show significant differences and, of those, which groups were different and in which direction? The other option is to pull cohorts but that runs into potentially weird issues of bias. So I think in the end I should run both models and see if I get a different answer, then decide how to present them in the paper.

# The assumptions of a one-way independent ANOVA are:

# 1. Independence of the datapoints (met)
# 2. Normality of model residuals within groups
# 3. Homogeneity of variance of model residuals within groups

mod <- tmp %>% 
  mutate(IID = factor(IID),
         FLARE_CAT = factor(FLARE_CAT)) %>% 
  ezANOVA(data = .,
               dv = PRS2,
               between = FLARE_CAT,
               wid = IID,
               type = 3,
               return_aov = TRUE)

mod

tmp %>% 
  ggplot(aes(x = FLARE_CAT, y = PRS2)) +
  geom_boxplot()

rm(tmp, mod, modsum, test)

Based on the above, it should be appropriate to run linear regression models on age at onset, but not flare frequency. For flare frequency, I could use ordinal logistic regression after categorizing flares into roughly even groups (this regression assumes that the effect between two levels of the variable are consistent across levels of the variable). I could also use ANOVA but this may prove difficult to adjust for covariates. For now I won’t run any models for this variable.


# We need to model both PRS and all SNPs against gout, adjusting for global PCs. It needs to be in males and females separately, and needs to be both adjusted for age at collection and unadjusted. It also needs to run with the 10 oceanian PCs for Polynesian cohorts. We want the output of all of these models to be stored in a list object which we can then extract all of the important elements from in a table format.

gout_data_list <- list("UK Biobank - Male" = full_join(data_list[["UK Biobank - Gout - Male"]], 
                                                       data_list[["UK Biobank - Control - Male"]]),
                       "UK Biobank - Female" = full_join(data_list[["UK Biobank - Gout - Female"]], 
                                                         data_list[["UK Biobank - Control - Female"]]),
                       "Aus/NZ European - Male" = full_join(data_list[["Aus/NZ European - Gout - Male"]],
                                                            data_list[["Aus/NZ European - Control - Male"]]),
                       "Aus/NZ European - Female" = full_join(data_list[["Aus/NZ European - Gout - Female"]],
                                                              data_list[["Aus/NZ European - Control - Female"]]),
                       "East Polynesian - Male" = full_join(data_list[["East Polynesian - Gout - Male"]],
                                                            data_list[["East Polynesian - Control - Male"]]),
                       "East Polynesian - Female" = full_join(data_list[["East Polynesian - Gout - Female"]],
                                                              data_list[["East Polynesian - Control - Female"]]),
                       "West Polynesian - Male" = full_join(data_list[["West Polynesian - Gout - Male"]],
                                                            data_list[["West Polynesian - Control - Male"]]),
                       "West Polynesian - Female" = full_join(data_list[["West Polynesian - Gout - Female"]],
                                                              data_list[["West Polynesian - Control - Female"]]))

# I want to model the following for every cohort: genotyped PRS, all SNPs in the genotyped PRS at the same time; next, for European cohorts only, I want to model the imputed PRS and all of its SNPs at the same time; all models should be run with and without AGECOL as a covariate


modlist <- vector("list", length(gout_data_list))
for(i in seq_along(gout_data_list)){
  prsnames <- list("PRS2", Poly_Gene_OR$RSID)
  
  if(!str_detect(names(gout_data_list)[i], "Polynesian")){
    prsnames <- append(prsnames, list("PRS1", UKBB_Gene_OR$RSID))
  }
  
  covariates <- c()
  
  if(!str_detect(names(gout_data_list)[i], "UK Biobank")){
    covariates <- c(covariates, "Geno.PCVector1", "Geno.PCVector2", "Geno.PCVector3", "Geno.PCVector4", "Geno.PCVector5", "Geno.PCVector6", "Geno.PCVector7", "Geno.PCVector8", "Geno.PCVector9", "Geno.PCVector10")
  }
  
  if(str_detect(names(gout_data_list)[i], "Polynesian")){
    covariates <- c(covariates, "Geno.PCVector1_Oc", "Geno.PCVector2_Oc", "Geno.PCVector3_Oc", "Geno.PCVector4_Oc", "Geno.PCVector5_Oc", "Geno.PCVector6_Oc", "Geno.PCVector7_Oc", "Geno.PCVector8_Oc", "Geno.PCVector9_Oc", "Geno.PCVector10_Oc")
  }
  
  covariates2 <- c(covariates, "AGECOL")
  
  tmplist <- vector("list", 2 * length(prsnames))
  
  for(j in seq_along(prsnames)) {
    variables <- c(prsnames[[j]], covariates)
    f <- as.formula(paste("GOUT", paste(variables, collapse = " + "), sep = " ~ "))
    assign(paste0("Model_", i, "_", j), glm(f, family = binomial, data = gout_data_list[[i]]))
    # columns will be: Cohort, N, N case, N control, Outcome, Predictors, Predictor, Covariates, OR, LCL, UCL, Pval, log-odds, standard error
    modstring1 <- c(names(gout_data_list)[[i]],
                    nrow(gout_data_list[[i]]),
                    nrow(gout_data_list[[i]] %>% filter(GOUT)),
                    nrow(gout_data_list[[i]] %>% filter(!GOUT)),
                    "Gout")
    
    modstring <- list()
    if(any(str_detect(prsnames[[j]], "PRS"))) {
      modstring[[1]] <- c(modstring1,
                          paste(prsnames[[j]], collapse = " + "),
                          paste(prsnames[[j]], collapse = " + "),
                          paste(covariates, collapse = " + "),
                          OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          LCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          UCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]]]],
                          summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]], 2])
    } else{
      for(k in 1:length(prsnames[[j]])){
        modstring[[k]] <- c(modstring1,
                            paste(prsnames[[j]], collapse = " + "),
                            paste(prsnames[[j]][k], collapse = " + "),
                            paste(covariates, collapse = " + "),
                            OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            LCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            UCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]][k]]],
                            summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]][k], 2])
      }
    }
    
    tmplist[[j]] <- modstring
    
    variables <- c(prsnames[[j]], covariates2)
    f <- as.formula(paste("GOUT", paste(variables, collapse = " + "), sep = " ~ "))
    assign(paste0("Model_", i, "_", j, "_adj"), glm(f, family = binomial, data = gout_data_list[[i]]))
    modstring1 <- c(names(gout_data_list)[[i]],
                    nrow(gout_data_list[[i]]),
                    nrow(gout_data_list[[i]] %>% filter(GOUT)),
                    nrow(gout_data_list[[i]] %>% filter(!GOUT)),
                    "Gout")
    
    modstring <- list()
    if(any(str_detect(prsnames[[j]], "PRS"))) {
      modstring[[1]] <- c(modstring1,
                          paste(prsnames[[j]], collapse = " + "),
                          paste(prsnames[[j]], collapse = " + "),
                          paste(covariates2, collapse = " + "),
                          OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          LCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          UCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          Pval(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          coef(get(paste0("Model_", i, "_", j, "_adj")))[[prsnames[[j]]]],
                          summary(get(paste0("Model_", i, "_", j, "_adj")))$coefficients[prsnames[[j]], 2])
    } else{
      for(k in 1:length(prsnames[[j]])){
        modstring[[k]] <- c(modstring1,
                            paste(prsnames[[j]], collapse = " + "),
                            paste(prsnames[[j]][k], collapse = " + "),
                            paste(covariates2, collapse = " + "),
                            OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            LCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            UCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            Pval(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            coef(get(paste0("Model_", i, "_", j, "_adj")))[[prsnames[[j]][k]]],
                            summary(get(paste0("Model_", i, "_", j, "_adj")))$coefficients[prsnames[[j]][k], 2])
      }
    }
    
    tmplist[[j + length(prsnames)]] <- modstring
  }
  
  modlist[[i]] <- tmplist
}

remove <- ls()
remove <- as_tibble(remove) %>% 
  filter(str_detect(value, "Model_"))
remove <- remove$value
rm(list = remove, remove)


tmp <- modlist %>% 
  flatten() %>% 
  flatten() %>% 
  as.data.frame() %>% 
  data.table::transpose()

colnames(tmp) <- c("Cohort", "N", "N case", "N control", "Outcome", "Predictors", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")

GoutModels <- tmp %>% 
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, `N case`, `N control`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))

#save(GoutModels, file = here("Output/GoutModels.RData"))

rm(modlist, tmp, tmplist, covariates, covariates2, f, i, j, k, modstring, modstring1, prsnames, variables)
# We need to model both PRSs and all SNPs against age at onset, tophi, and flare category (by both ordinal logistic regression and ANOVA), adjusting for global PCs. It needs to be in males and females separately, and needs to be both adjusted for disease duration (for tophi) and unadjusted. It also needs to run with the 10 oceanian PCs for Polynesian cohorts. We want the output of all of these models to be stored in a list object which we can then extract all of the important elements from in a table format.

# Onset --------------------------------------------------------
onset_data_list <- list("Aus/NZ European - Male" = data_list[["Aus/NZ European - Gout - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Aus/NZ European - Female" = data_list[["Aus/NZ European - Gout - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "GlobalGout - Male" = data_list[["GlobalGout - Gout - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "GlobalGout - Female" = data_list[["GlobalGout - Gout - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - LASSO - Male" = data_list[["Ardea - LASSO - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - LASSO - Female" = data_list[["Ardea - LASSO - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CLEAR1 - Male" = data_list[["Ardea - CLEAR1 - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CLEAR1 - Female" = data_list[["Ardea - CLEAR1 - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CLEAR2 - Male" = data_list[["Ardea - CLEAR2 - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CLEAR2 - Female" = data_list[["Ardea - CLEAR2 - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CRYSTAL - Male" = data_list[["Ardea - CRYSTAL - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - CRYSTAL - Female" = data_list[["Ardea - CRYSTAL - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - LIGHT - Male" = data_list[["Ardea - LIGHT - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "Ardea - LIGHT - Female" = data_list[["Ardea - LIGHT - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "East Polynesian - Male" = data_list[["East Polynesian - Gout - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "East Polynesian - Female" = data_list[["East Polynesian - Gout - Female"]] %>% filter(!is.na(AGE1ATK)),
                        "West Polynesian - Male" = data_list[["West Polynesian - Gout - Male"]] %>% filter(!is.na(AGE1ATK)),
                        "West Polynesian - Female" = data_list[["West Polynesian - Gout - Female"]] %>% filter(!is.na(AGE1ATK)))

for(i in length(onset_data_list):1){
  if(nrow(onset_data_list[[i]]) < 50){
    onset_data_list[[i]] <- NULL
  }
}

modlist <- vector("list", length(onset_data_list))
for(i in seq_along(onset_data_list)){
  prsnames <- list("PRS2", Poly_Gene_OR$RSID)
  
  if(!str_detect(names(onset_data_list)[i], "Polynesian")){
    prsnames <- append(prsnames, list("PRS1", UKBB_Gene_OR$RSID))
  }
  
  covariates <- c()
  
  if(!str_detect(names(onset_data_list)[i], "UK Biobank")){
    covariates <- c(covariates, "Geno.PCVector1", "Geno.PCVector2", "Geno.PCVector3", "Geno.PCVector4", "Geno.PCVector5", "Geno.PCVector6", "Geno.PCVector7", "Geno.PCVector8", "Geno.PCVector9", "Geno.PCVector10")
  }
  
  if(str_detect(names(onset_data_list)[i], "Polynesian")){
    covariates <- c(covariates, "Geno.PCVector1_Oc", "Geno.PCVector2_Oc", "Geno.PCVector3_Oc", "Geno.PCVector4_Oc", "Geno.PCVector5_Oc", "Geno.PCVector6_Oc", "Geno.PCVector7_Oc", "Geno.PCVector8_Oc", "Geno.PCVector9_Oc", "Geno.PCVector10_Oc")
  }
  
  tmplist <- vector("list", length(prsnames))
  
  for(j in seq_along(prsnames)) {
    variables <- c(prsnames[[j]], covariates)
    f <- as.formula(paste("AGE1ATK", paste(variables, collapse = " + "), sep = " ~ "))
    assign(paste0("Model_", i, "_", j), lm(f, data = onset_data_list[[i]]))
    # columns will be: Cohort, N, Outcome, Predictors, Predictor, Covariates, Beta, LCL, UCL, Pval, standard error
    modstring1 <- c(names(onset_data_list)[[i]],
                    nrow(onset_data_list[[i]]),
                    "Age at Onset (years)")
    
    modstring <- list()
    if(any(str_detect(prsnames[[j]], "PRS"))) {
      modstring[[1]] <- c(modstring1,
                          paste(prsnames[[j]], collapse = " + "),
                          paste(prsnames[[j]], collapse = " + "),
                          paste(covariates, collapse = " + "),
                          Beta(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          LCL(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          UCL(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]], 2])
    } else{
      for(k in 1:length(prsnames[[j]])){
        modstring[[k]] <- c(modstring1,
                            paste(prsnames[[j]], collapse = " + "),
                            paste(prsnames[[j]][k], collapse = " + "),
                            paste(covariates, collapse = " + "),
                            Beta(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            LCL(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            UCL(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]][k], 2])
      }
    }
    
    tmplist[[j]] <- modstring
  }
  
  modlist[[i]] <- tmplist
}

remove <- ls()
remove <- as_tibble(remove) %>% 
  filter(str_detect(value, "Model_"))
remove <- remove$value
rm(list = remove, remove)


tmp <- modlist %>% 
  flatten() %>% 
  flatten() %>% 
  as.data.frame() %>% 
  data.table::transpose()

colnames(tmp) <- c("Cohort", "N", "Outcome", "Predictors", "Predictor", "Covariates", "Beta", "LCL", "UCL", "Pval", "SE")

OnsetModels <- tmp %>% 
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, Beta, LCL, UCL, Pval, SE), as.numeric))

#save(OnsetModels, file = here("Output/OnsetModels.RData"))

rm(modlist, tmp, tmplist, covariates, f, i, j, k, modstring, modstring1, prsnames, variables)



# Tophi -------------------------------------------------------------------------------
tophi_data_list <- list("Aus/NZ European - Male" = data_list[["Aus/NZ European - Gout - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Aus/NZ European - Female" = data_list[["Aus/NZ European - Gout - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "GlobalGout - Male" = data_list[["GlobalGout - Gout - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "GlobalGout - Female" = data_list[["GlobalGout - Gout - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - LASSO - Male" = data_list[["Ardea - LASSO - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - LASSO - Female" = data_list[["Ardea - LASSO - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - CLEAR1 - Male" = data_list[["Ardea - CLEAR1 - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - CLEAR1 - Female" = data_list[["Ardea - CLEAR1 - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - CLEAR2 - Male" = data_list[["Ardea - CLEAR2 - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - CLEAR2 - Female" = data_list[["Ardea - CLEAR2 - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - LIGHT - Male" = data_list[["Ardea - LIGHT - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "Ardea - LIGHT - Female" = data_list[["Ardea - LIGHT - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "East Polynesian - Male" = data_list[["East Polynesian - Gout - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "East Polynesian - Female" = data_list[["East Polynesian - Gout - Female"]] %>% filter(!is.na(TOPHIGOUT)),
                        "West Polynesian - Male" = data_list[["West Polynesian - Gout - Male"]] %>% filter(!is.na(TOPHIGOUT)),
                        "West Polynesian - Female" = data_list[["West Polynesian - Gout - Female"]] %>% filter(!is.na(TOPHIGOUT)))

for(i in length(tophi_data_list):1){
  if(nrow(tophi_data_list[[i]]) < 50){
    tophi_data_list[[i]] <- NULL
  }
}



modlist <- vector("list", length(tophi_data_list))
for(i in seq_along(tophi_data_list)){
  prsnames <- list("PRS2", Poly_Gene_OR$RSID)
  
  if(!str_detect(names(tophi_data_list)[i], "Polynesian")){
    prsnames <- append(prsnames, list("PRS1", UKBB_Gene_OR$RSID))
  }
  
  covariates <- c()
  
  if(!str_detect(names(tophi_data_list)[i], "UK Biobank")){
    covariates <- c(covariates, "Geno.PCVector1", "Geno.PCVector2", "Geno.PCVector3", "Geno.PCVector4", "Geno.PCVector5", "Geno.PCVector6", "Geno.PCVector7", "Geno.PCVector8", "Geno.PCVector9", "Geno.PCVector10")
  }
  
  if(str_detect(names(tophi_data_list)[i], "Polynesian")){
    covariates <- c(covariates, "Geno.PCVector1_Oc", "Geno.PCVector2_Oc", "Geno.PCVector3_Oc", "Geno.PCVector4_Oc", "Geno.PCVector5_Oc", "Geno.PCVector6_Oc", "Geno.PCVector7_Oc", "Geno.PCVector8_Oc", "Geno.PCVector9_Oc", "Geno.PCVector10_Oc")
  }
  
  covariates2 <- c(covariates, "DURATION")
  
  tmplist <- vector("list", 2 * length(prsnames))
  
  for(j in seq_along(prsnames)) {
    variables <- c(prsnames[[j]], covariates)
    f <- as.formula(paste("TOPHIGOUT", paste(variables, collapse = " + "), sep = " ~ "))
    assign(paste0("Model_", i, "_", j), glm(f, family = binomial, data = tophi_data_list[[i]]))
    # columns will be: Cohort, N, N case, N control, Outcome, Predictors, Predictor, Covariates, OR, LCL, UCL, Pval, log-odds, standard error
    modstring1 <- c(names(tophi_data_list)[[i]],
                    nrow(tophi_data_list[[i]]),
                    nrow(tophi_data_list[[i]] %>% filter(TOPHIGOUT)),
                    nrow(tophi_data_list[[i]] %>% filter(!TOPHIGOUT)),
                    "Tophi")
    
    modstring <- list()
    if(any(str_detect(prsnames[[j]], "PRS"))) {
      modstring[[1]] <- c(modstring1,
                          paste(prsnames[[j]], collapse = " + "),
                          paste(prsnames[[j]], collapse = " + "),
                          paste(covariates, collapse = " + "),
                          OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          LCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          UCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]]),
                          coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]]]],
                          summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]], 2])
    } else{
      for(k in 1:length(prsnames[[j]])){
        modstring[[k]] <- c(modstring1,
                            paste(prsnames[[j]], collapse = " + "),
                            paste(prsnames[[j]][k], collapse = " + "),
                            paste(covariates, collapse = " + "),
                            OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            LCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            UCL_OR(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            Pval(get(paste0("Model_", i, "_", j)), prsnames[[j]][k]),
                            coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]][k]]],
                            summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]][k], 2])
      }
    }
    
    tmplist[[j]] <- modstring
    
    variables <- c(prsnames[[j]], covariates2)
    f <- as.formula(paste("TOPHIGOUT", paste(variables, collapse = " + "), sep = " ~ "))
    assign(paste0("Model_", i, "_", j, "_adj"), glm(f, family = binomial, data = tophi_data_list[[i]]))
    modstring1 <- c(names(tophi_data_list)[[i]],
                    nrow(tophi_data_list[[i]]),
                    nrow(tophi_data_list[[i]] %>% filter(TOPHIGOUT)),
                    nrow(tophi_data_list[[i]] %>% filter(!TOPHIGOUT)),
                    "Tophi")
    
    modstring <- list()
    if(any(str_detect(prsnames[[j]], "PRS"))) {
      modstring[[1]] <- c(modstring1,
                          paste(prsnames[[j]], collapse = " + "),
                          paste(prsnames[[j]], collapse = " + "),
                          paste(covariates2, collapse = " + "),
                          OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          LCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          UCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          Pval(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]]),
                          coef(get(paste0("Model_", i, "_", j, "_adj")))[[prsnames[[j]]]],
                          summary(get(paste0("Model_", i, "_", j, "_adj")))$coefficients[prsnames[[j]], 2])
    } else{
      for(k in 1:length(prsnames[[j]])){
        modstring[[k]] <- c(modstring1,
                            paste(prsnames[[j]], collapse = " + "),
                            paste(prsnames[[j]][k], collapse = " + "),
                            paste(covariates2, collapse = " + "),
                            OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            LCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            UCL_OR(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            Pval(get(paste0("Model_", i, "_", j, "_adj")), prsnames[[j]][k]),
                            coef(get(paste0("Model_", i, "_", j, "_adj")))[[prsnames[[j]][k]]],
                            summary(get(paste0("Model_", i, "_", j, "_adj")))$coefficients[prsnames[[j]][k], 2])
      }
    }
    
    tmplist[[j + length(prsnames)]] <- modstring
  }
  
  modlist[[i]] <- tmplist
}

remove <- ls()
remove <- as_tibble(remove) %>% 
  filter(str_detect(value, "Model_"))
remove <- remove$value
rm(list = remove, remove)


tmp <- modlist %>% 
  flatten() %>% 
  flatten() %>% 
  as.data.frame() %>% 
  data.table::transpose()

colnames(tmp) <- c("Cohort", "N", "N case", "N control", "Outcome", "Predictors", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")

TophiModels <- tmp %>% 
  mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
         across(c(N, `N case`, `N control`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))

#save(TophiModels, file = here("Output/TophiModels.RData"))

rm(modlist, tmp, tmplist, covariates, covariates2, f, i, j, k, modstring, modstring1, prsnames, variables)



# Flare models ------------------------------------------------------------------------------------
# First let's do the polr based models with FLARE_CAT as the outcome, which should be done in GlobalGout, ANZ, and Polynesian cohorts with all categories, then subsequently done in all cohorts excluding those with < 2 flares
# flare_data_list1 <- list("Aus/NZ European - Male" = data_list[["Aus/NZ European - Gout - Male"]] %>% filter(!is.na(FLARE_CAT)),
#                          "Aus/NZ European - Female" = data_list[["Aus/NZ European - Gout - Female"]] %>% filter(!is.na(FLARE_CAT)),
#                          "GlobalGout - Male" = data_list[["GlobalGout - Gout - Male"]] %>% filter(!is.na(FLARE_CAT)),
#                          "GlobalGout - Female" = data_list[["GlobalGout - Gout - Female"]] %>% filter(!is.na(FLARE_CAT)),
#                          "East Polynesian - Male" = data_list[["East Polynesian - Gout - Male"]] %>% filter(!is.na(FLARE_CAT)),
#                          "East Polynesian - Female" = data_list[["East Polynesian - Gout - Female"]] %>% filter(!is.na(FLARE_CAT)),
#                          "West Polynesian - Male" = data_list[["West Polynesian - Gout - Male"]] %>% filter(!is.na(FLARE_CAT)),
#                          "West Polynesian - Female" = data_list[["West Polynesian - Gout - Female"]] %>% filter(!is.na(FLARE_CAT)))
# 
# for(i in length(flare_data_list1):1){
#   if(nrow(flare_data_list1[[i]]) < 50){
#     flare_data_list1[[i]] <- NULL
#   }
# }
# 
# 
# 
# flare_data_list2 <- list("Aus/NZ European - Male" = data_list[["Aus/NZ European - Gout - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Aus/NZ European - Female" = data_list[["Aus/NZ European - Gout - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "GlobalGout - Male" = data_list[["GlobalGout - Gout - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "GlobalGout - Female" = data_list[["GlobalGout - Gout - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - LASSO - Male" = data_list[["Ardea - LASSO - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - LASSO - Female" = data_list[["Ardea - LASSO - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CLEAR1 - Male" = data_list[["Ardea - CLEAR1 - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CLEAR1 - Female" = data_list[["Ardea - CLEAR1 - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CLEAR2 - Male" = data_list[["Ardea - CLEAR2 - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CLEAR2 - Female" = data_list[["Ardea - CLEAR2 - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CRYSTAL - Male" = data_list[["Ardea - CRYSTAL - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - CRYSTAL - Female" = data_list[["Ardea - CRYSTAL - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - LIGHT - Male" = data_list[["Ardea - LIGHT - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "Ardea - LIGHT - Female" = data_list[["Ardea - LIGHT - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "East Polynesian - Male" = data_list[["East Polynesian - Gout - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "East Polynesian - Female" = data_list[["East Polynesian - Gout - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "West Polynesian - Male" = data_list[["West Polynesian - Gout - Male"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1),
#                          "West Polynesian - Female" = data_list[["West Polynesian - Gout - Female"]] %>% filter(!is.na(FLARE_CAT), NUMATK > 1))
# 
# for(i in length(flare_data_list2):1){
#   if(nrow(flare_data_list2[[i]]) < 50){
#     flare_data_list2[[i]] <- NULL
#   }
# }
# 
# modlist <- vector("list", length(flare_data_list1))
# for(i in seq_along(flare_data_list1)){
#   prsnames <- list("PRS2", Poly_Gene_OR$RSID)
#   
#   if(!str_detect(names(flare_data_list1)[i], "Polynesian")){
#     prsnames <- append(prsnames, list("PRS1", UKBB_Gene_OR$RSID))
#   }
#   
#   covariates <- c()
#   
#   if(!str_detect(names(flare_data_list1)[i], "UK Biobank")){
#     covariates <- c(covariates, "Geno.PCVector1", "Geno.PCVector2", "Geno.PCVector3", "Geno.PCVector4", "Geno.PCVector5", "Geno.PCVector6", "Geno.PCVector7", "Geno.PCVector8", "Geno.PCVector9", "Geno.PCVector10")
#   }
#   
#   if(str_detect(names(flare_data_list1)[i], "Polynesian")){
#     covariates <- c(covariates, "Geno.PCVector1_Oc", "Geno.PCVector2_Oc", "Geno.PCVector3_Oc", "Geno.PCVector4_Oc", "Geno.PCVector5_Oc", "Geno.PCVector6_Oc", "Geno.PCVector7_Oc", "Geno.PCVector8_Oc", "Geno.PCVector9_Oc", "Geno.PCVector10_Oc")
#   }
#   
#   tmplist <- vector("list", length(prsnames))
#   
#   for(j in seq_along(prsnames)) {
#     variables <- c(prsnames[[j]], covariates)
#     f <- as.formula(paste("FLARE_CAT", paste(variables, collapse = " + "), sep = " ~ "))
#     assign(paste0("Model_", i, "_", j), polr(f, Hess = TRUE, data = flare_data_list1[[i]]))
#     # columns will be: Cohort, N, N group 0, N group 1, N group 2, N group 3, N group 4, N group 5, N group 6 - 11, N group 12 - 52, Outcome, Predictors, Predictor, Covariates, OR, LCL, UCL, Pval, log-odds, standard error
#     modstring1 <- c(names(flare_data_list1)[[i]],
#                     nrow(flare_data_list1[[i]]),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "0")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "1")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "2")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "3")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "4")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "5")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "6 - 11")),
#                     nrow(flare_data_list1[[i]] %>% filter(FLARE_CAT == "12 - 52")),
#                     "Number of Flares in Last Year (categorical)")
#     
#     modstring <- list()
#     if(any(str_detect(prsnames[[j]], "PRS"))) {
#       modstring[[1]] <- c(modstring1,
#                           paste(prsnames[[j]], collapse = " + "),
#                           paste(prsnames[[j]], collapse = " + "),
#                           paste(covariates, collapse = " + "),
#                           exp(summary(get(paste0("Model_", i, "_", j)))$coefficients)[prsnames[[j]]],
#                           exp(confint.default(get(paste0("Model_", i, "_", j))))[prsnames[[j]], 1],
#                           exp(confint.default(get(paste0("Model_", i, "_", j))))[prsnames[[j]], 2],
#                           pnorm(abs(summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]], "t value"]), lower.tail = FALSE) * 2,
#                           coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]]]],
#                           summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]], 2])
#     } else{
#       for(k in 1:length(prsnames[[j]])){
#         modstring[[k]] <- c(modstring1,
#                             paste(prsnames[[j]], collapse = " + "),
#                             paste(prsnames[[j]][k], collapse = " + "),
#                             paste(covariates, collapse = " + "),
#                             exp(summary(get(paste0("Model_", i, "_", j)))$coefficients)[prsnames[[j]][k]],
#                             exp(confint.default(get(paste0("Model_", i, "_", j))))[prsnames[[j]][k], 1],
#                             exp(confint.default(get(paste0("Model_", i, "_", j))))[prsnames[[j]][k], 2],
#                             pnorm(abs(summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]][k], "t value"]), lower.tail = FALSE) * 2,
#                             coef(get(paste0("Model_", i, "_", j)))[[prsnames[[j]][k]]],
#                             summary(get(paste0("Model_", i, "_", j)))$coefficients[prsnames[[j]][k], 2])
#       }
#     }
#     
#     tmplist[[j]] <- modstring
#   }
#   
#   modlist[[i]] <- tmplist
# }
# 
# remove <- ls()
# remove <- as_tibble(remove) %>% 
#   filter(str_detect(value, "Model_"))
# remove <- remove$value
# rm(list = remove, remove)
# 
# 
# tmp <- modlist %>% 
#   flatten() %>% 
#   flatten() %>% 
#   as.data.frame() %>% 
#   data.table::transpose()
# 
# colnames(tmp) <- c("Cohort", "N", "N group 0", "N group 1", "N group 2", "N group 3", "N group 4", "N group 5", "N group 6 - 11", "N group 12 - 52", "Outcome", "Predictors", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")
# 
# FlareModels1 <- tmp %>% 
#   mutate(across(c(Cohort, Outcome, Predictors, Predictor, Covariates), factor),
#          across(c(N, `N group 0`, `N group 1`, `N group 2`, `N group 3`, `N group 4`, `N group 5`, `N group 6 - 11`, `N group 12 - 52`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))
# 
# 
# 
# modlist <- vector("list", length(flare_data_list1))
# for(i in seq_along(flare_data_list1)){
#   tmplist <- vector("list", length(prsnames))
#   for(j in seq_along(prsnames)) {
#     # columns will be: Cohort, N, N group 0, N group 1, N group 2, N group 3, N group 4, N group 5, N group 6 - 11, N group 12 - 52, Outcome, Predictor, Covariates, OR, LCL, UCL, Pval, log-odds, standard error
#     modstring <- c(names(flare_data_list1)[[i]],
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]))),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "0")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "1")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "2")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "3")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "4")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "5")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "6 - 11")),
#                    nrow(flare_data_list1[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "12 - 52")),
#                    "Number of Flares in Last Year (categorical)",
#                    prsnames[[j]],
#                    paste(covariates, collapse = " + "),
#                    exp(summary(get(paste0("Model_", i, "_", j)))$coefficients)[1],
#                    exp(confint.default(get(paste0("Model_", i, "_", j))))[1, 1],
#                    exp(confint.default(get(paste0("Model_", i, "_", j))))[1, 2],
#                    pnorm(abs(summary(get(paste0("Model_", i, "_", j)))$coefficients[1, "t value"]), lower.tail = FALSE) * 2,
#                    coef(get(paste0("Model_", i, "_", j)))[[1]],
#                    summary(get(paste0("Model_", i, "_", j)))$coefficients[1, 2])
#     tmplist[[j]] <- modstring
#   }
#   
#   modlist[[i]] <- tmplist
# }
# 
# remove <- ls()
# remove <- as_tibble(remove) %>% 
#   filter(str_detect(value, "Model_"))
# remove <- remove$value
# rm(list = remove, remove)
# 
# 
# 
# out <- data.frame()
# for(i in 1:length(modlist)){
#   tmp <- modlist[[i]] %>% 
#     map(as.data.frame) %>% 
#     bind_cols() %>% 
#     data.table::transpose()
#   
#   colnames(tmp) <- c("Cohort", "N", "N group 0", "N group 1", "N group 2", "N group 3", "N group 4", "N group 5", "N group 6 - 11", "N group 12 - 52", "Outcome", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")
#   
#   out <- rbind(out, tmp)
# }
# 
# FlareModels1 <- out
# 
# FlareModels1 <- FlareModels1 %>% 
#   mutate(across(c(Cohort, Outcome, Predictor, Covariates), factor),
#          across(c(N, `N group 0`, `N group 1`, `N group 2`, `N group 3`, `N group 4`, `N group 5`, `N group 6 - 11`, `N group 12 - 52`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))
# 
# #save(FlareModels1, file = here("Output/FlareModels1.RData"))
# 
# rm(modlist, out, tmp, tmplist, covariates, f, i, j, modstring, prsnames, variables)
# 
# modlist <- vector("list", length(flare_data_list2))
# for(i in seq_along(flare_data_list2)){
#   if(flare_data_list2[[i]] %>% nrow() > 50){
#     prsnames <- flare_data_list2[[i]] %>% 
#       select(PRS1, PRS2, starts_with("rs")) %>% 
#       select(where(~ !all(is.na(.x)))) %>% 
#       colnames()
#   
#     covariates <- c("Geno.PCVector1", "Geno.PCVector2", "Geno.PCVector3", "Geno.PCVector4", "Geno.PCVector5", "Geno.PCVector6", "Geno.PCVector7", "Geno.PCVector8", "Geno.PCVector9", "Geno.PCVector10")
#     
#     
#     if(str_detect(names(flare_data_list2)[i], "Polynesian")){
#       covariates <- c(covariates, "Geno.PCVector1_Oc", "Geno.PCVector2_Oc", "Geno.PCVector3_Oc", "Geno.PCVector4_Oc", "Geno.PCVector5_Oc", "Geno.PCVector6_Oc", "Geno.PCVector7_Oc", "Geno.PCVector8_Oc", "Geno.PCVector9_Oc", "Geno.PCVector10_Oc")
#     }
#     
#     tmplist <- vector("list", length(prsnames))
#     for(j in seq_along(prsnames)) {
#       variables <- c(prsnames[j], covariates)
#       f <- as.formula(paste("FLARE_CAT", paste(variables, collapse = " + "), sep = " ~ "))
#       assign(paste0("Model_", i, "_", j), polr(f, Hess = TRUE, data = flare_data_list2[[i]]))
#       # columns will be: Cohort, N, N group 0, N group 1, N group 2, N group 3, N group 4, N group 5, N group 6 - 11, N group 12 - 52, Outcome, Predictor, Covariates, OR, LCL, UCL, Pval, log-odds, standard error
#       modstring <- c(names(flare_data_list2)[[i]],
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]))),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "2")),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "3")),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "4")),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "5")),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "6 - 11")),
#                      nrow(flare_data_list2[[i]] %>% filter(!is.na(prsnames[[j]]), FLARE_CAT == "12 - 52")),
#                      "Number of Flares in Last Year (categorical)",
#                      prsnames[[j]],
#                      paste(covariates, collapse = " + "),
#                      exp(summary(get(paste0("Model_", i, "_", j)))$coefficients)[1],
#                      exp(confint.default(get(paste0("Model_", i, "_", j))))[1, 1],
#                      exp(confint.default(get(paste0("Model_", i, "_", j))))[1, 2],
#                      pnorm(abs(summary(get(paste0("Model_", i, "_", j)))$coefficients[1, "t value"]), lower.tail = FALSE) * 2,
#                      coef(get(paste0("Model_", i, "_", j)))[[1]],
#                      summary(get(paste0("Model_", i, "_", j)))$coefficients[1, 2])
#       tmplist[[j]] <- modstring
#     }
#     
#     modlist[[i]] <- tmplist
#   }
# }
# 
# remove <- ls()
# remove <- as_tibble(remove) %>% 
#   filter(str_detect(value, "Model_"))
# remove <- remove$value
# rm(list = remove, remove)
# 
# 
# 
# out <- data.frame()
# for(i in seq_along(modlist)){
#   if(!is.null(modlist[[i]])){
#     tmp <- modlist[[i]] %>% 
#       map(as.data.frame) %>% 
#       bind_cols() %>% 
#       data.table::transpose()
#   } else{
#     tmp <- data.frame(matrix(ncol = 17, nrow = 0))
#   }
#   
#   colnames(tmp) <- c("Cohort", "N", "N group 2", "N group 3", "N group 4", "N group 5", "N group 6 - 11", "N group 12 - 52", "Outcome", "Predictor", "Covariates", "OR", "LCL", "UCL", "Pval", "log-odds", "SE")
#   
#   out <- rbind(out, tmp)
# }
# 
# FlareModels2 <- out
# 
# FlareModels2 <- FlareModels2 %>% 
#   mutate(across(c(Cohort, Outcome, Predictor, Covariates), factor),
#          across(c(N, `N group 2`, `N group 3`, `N group 4`, `N group 5`, `N group 6 - 11`, `N group 12 - 52`, OR, LCL, UCL, Pval, `log-odds`, SE), as.numeric))
# 
# #save(FlareModels2, file = here("Output/FlareModels2.RData"))
# 
# rm(modlist, out, tmp, tmplist, covariates, f, i, j, modstring, prsnames, variables)


Making Plots and Tables for the Final Manuscript

# Datasets
load(here("Output/all_pheno_prs.RData"))

all_pheno_prs <- all_pheno_prs %>%
  mutate(FLARE_CAT = factor(case_when(between(NUMATK, 0, 5) ~ paste0(as.character(NUMATK), " flares in last year"),
                                      between(NUMATK, 6, 11) ~ "One every one to two months", 
                                      between(NUMATK, 12, 52) ~ "One or more per month"),
                            levels = c(paste0(0:5, " flares in last year"),
                                       "One every one to two months",
                                       "One or more per month"),
                            labels = c(paste0(0:5), 
                                       "6 - 11",
                                       "12 - 52"),
                            ordered = TRUE),
         AGE1ATK = case_when(GOUT ~ AGE1ATK),
         DURATION = case_when(GOUT ~ DURATION),
         NUMATK = case_when(GOUT ~ NUMATK),
         TOPHIGOUT = case_when(GOUT ~ TOPHIGOUT),
         ULT = case_when(GOUT ~ ULT),
         SEX = factor(SEX, levels = c("Male", "Female")),
         GOUT2 = factor(GOUT, levels = c(TRUE, FALSE), labels = c("Gout", "Control")),
         GROUP = factor(case_when(GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "European Control",
                                  GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian Control",
                                  GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Gout",
                                  !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian Control"), 
                        levels = c("European Gout", "European Control", "East Polynesian Gout", "East Polynesian Control", "West Polynesian Gout", "West Polynesian Control")),
         GROUP2 = factor(case_when(GROUP == "European Gout" & SEX == "Male" ~ "European Gout - male",
                                  GROUP == "European Gout" & SEX == "Female" ~ "European Gout - female",
                                  GROUP == "European Control" & SEX == "Male" ~ "European Control - male",
                                  GROUP == "European Control" & SEX == "Female" ~ "European Control - female",
                                  GROUP == "East Polynesian Gout" & SEX == "Male" ~ "East Polynesian Gout - male",
                                  GROUP == "East Polynesian Gout" & SEX == "Female" ~ "East Polynesian Gout - female",
                                  GROUP == "East Polynesian Control" & SEX == "Male" ~ "East Polynesian Control - male",
                                  GROUP == "East Polynesian Control" & SEX == "Female" ~ "East Polynesian Control - female",
                                  GROUP == "West Polynesian Gout" & SEX == "Male" ~ "West Polynesian Gout - male",
                                  GROUP == "West Polynesian Gout" & SEX == "Female" ~ "West Polynesian Gout - female",
                                  GROUP == "West Polynesian Control" & SEX == "Male" ~ "West Polynesian Control - male",
                                  GROUP == "West Polynesian Control" & SEX == "Female" ~ "West Polynesian Control - female"), 
                        levels = c("European Gout - male", "European Gout - female", "European Control - male", "European Control - female", "East Polynesian Gout - male", "East Polynesian Gout - female", "East Polynesian Control - male", "East Polynesian Control - female", "West Polynesian Gout - male", "West Polynesian Gout - female", "West Polynesian Control - male", "West Polynesian Control - female")),
         GROUP3 = factor(case_when(GOUT & SEX == "Male" ~ "Male Gout",
                                   GOUT & SEX == "Female" ~ "Female Gout",
                                   !GOUT & SEX == "Male" ~ "Male Control",
                                   !GOUT & SEX == "Female" ~ "Female Control"), 
                         levels = c("Male Gout", "Female Gout", "Male Control", "Female Control")),
         COHORT2 = factor(case_when(GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Gout",
                                    !GOUT & Pheno.Study == "UK Biobank" ~ "UK Biobank - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") & Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease") ~ "Aus/NZ - Control",
                                    GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Gout",
                                    !GOUT & Pheno.Study == "EuroGout" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "GlobalGout - Control",
                                    Pheno.Study == "Ardea: 401" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LASSO",
                                    Pheno.Study == "Ardea: CLEAR1" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR1",
                                    Pheno.Study == "Ardea: CLEAR2" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CLEAR2",
                                    Pheno.Study == "Ardea: CRYSTAL" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - CRYSTAL",
                                    Pheno.Study == "Ardea: LIGHT" & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") ~ "Ardea - LIGHT",
                                    GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("East Polynesian") ~ "East Polynesian - Control",
                                    GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Gout",
                                    !GOUT & Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan") ~ "West Polynesian - Control"), 
                          levels = c("UK Biobank - Gout", "UK Biobank - Control", "Aus/NZ - Gout", "Aus/NZ - Control", "GlobalGout - Gout", "GlobalGout - Control", "Ardea - LASSO", "Ardea - CLEAR1", "Ardea - CLEAR2", "Ardea - CRYSTAL", "Ardea - LIGHT", "East Polynesian - Gout", "East Polynesian - Control", "West Polynesian - Gout", "West Polynesian - Control")),
         NUMATK = round(NUMATK)) %>% 
  filter(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian") | !(Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")) & is.na(PRS1),
         !is.na(AGECOL),
         !is.na(PRS2),
         !(is.na(PRS1) & Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
         (Pheno.Study == "UK Biobank" | !GOUT | GOUT & !(is.na(AGE1ATK) & is.na(NUMATK) & is.na(TOPHIGOUT))),
         !is.na(COHORT2))

all_pheno_prs_male <- all_pheno_prs %>% 
  filter(SEX == "Male")

all_pheno_prs_female <- all_pheno_prs %>% 
  filter(SEX == "Female")

data_list <- list("UK Biobank - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                             Pheno.Study == "UK Biobank"),
                  "UK Biobank - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                 Pheno.Study == "UK Biobank"),
                  "UK Biobank - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                Pheno.Study == "UK Biobank"),
                  "UK Biobank - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                    Pheno.Study == "UK Biobank"),
                  "Aus/NZ European - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                  Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                      Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                     Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "Aus/NZ European - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian"),
                                                                                         Pheno.Study %in% c("AGRIA", "Diabetes Mellitus", "Gout in Aotearoa", "LPA", "Ngati Porou", "Renal Disease")),
                  "GlobalGout - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                             Pheno.Study == "EuroGout",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                 Pheno.Study == "EuroGout",
                                                                                 Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                Pheno.Study == "EuroGout",
                                                                                Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "GlobalGout - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                    Pheno.Study == "EuroGout",
                                                                                    Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LASSO - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: 401",
                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LASSO - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: 401",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR1 - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                                          Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR1 - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR1",
                                                                              Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR2 - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                                          Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CLEAR2 - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CLEAR2",
                                                                              Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CRYSTAL - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                                           Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - CRYSTAL - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: CRYSTAL",
                                                                               Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LIGHT - Male" = all_pheno_prs_male %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                                         Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "Ardea - LIGHT - Female" = all_pheno_prs_female %>% filter(Pheno.Study == "Ardea: LIGHT",
                                                                             Geno.SpecificAncestry %in% c("European", "European; Iberian", "Iberian")),
                  "East Polynesian - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("East Polynesian")),
                  "East Polynesian - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("East Polynesian")),
                  "West Polynesian - Gout - Male" = all_pheno_prs_male %>% filter(GOUT,
                                                                                  Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Gout - Female" = all_pheno_prs_female %>% filter(GOUT,
                                                                                      Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Control - Male" = all_pheno_prs_male %>% filter(!GOUT,
                                                                                     Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")),
                  "West Polynesian - Control - Female" = all_pheno_prs_female %>% filter(!GOUT,
                                                                                         Geno.SpecificAncestry %in% c("West Polynesian", "Niuean", "Pukapukan")))

cohortstring <- c("UK Biobank - Gout - Male",
                  "UK Biobank - Gout - Female",
                  "UK Biobank - Control - Male",
                  "UK Biobank - Control - Female",
                  "Aus/NZ European - Gout - Male",
                  "Aus/NZ European - Gout - Female",
                  "Aus/NZ European - Control - Male",
                  "Aus/NZ European - Control - Female",
                  "GlobalGout - Gout - Male",
                  "GlobalGout - Gout - Female",
                  "GlobalGout - Control - Male",
                  "GlobalGout - Control - Female",
                  "Ardea - LASSO - Male",
                  "Ardea - LASSO - Female",
                  "Ardea - CLEAR1 - Male",
                  "Ardea - CLEAR1 - Female",
                  "Ardea - CLEAR2 - Male",
                  "Ardea - CLEAR2 - Female",
                  "Ardea - CRYSTAL - Male",
                  "Ardea - CRYSTAL - Female",
                  "Ardea - LIGHT - Male",
                  "Ardea - LIGHT - Female",
                  "East Polynesian - Gout - Male",
                  "East Polynesian - Gout - Female",
                  "East Polynesian - Control - Male",
                  "East Polynesian - Control - Female",
                  "West Polynesian - Gout - Male",
                  "West Polynesian - Gout - Female",
                  "West Polynesian - Control - Male",
                  "West Polynesian - Control - Female")

clean_names <- c("UK&nbsp;Biobank<br/>Gout<br/>Male",
                 "UK&nbsp;Biobank<br/>Gout<br/>Female",
                 "UK&nbsp;Biobank<br/>Control<br/>Male",
                 "UK&nbsp;Biobank<br/>Control<br/>Female",
                 "Aus/NZ&nbsp;European<br/>Gout<br/>Male",
                 "Aus/NZ&nbsp;European<br/>Gout<br/>Female",
                 "Aus/NZ&nbsp;European<br/>Control<br/>Male",
                 "Aus/NZ&nbsp;European<br/>Control<br/>Female",
                 "GlobalGout<br/>Gout<br/>Male",
                 "GlobalGout<br/>Gout<br/>Female",
                 "GlobalGout<br/>Control<br/>Male",
                 "GlobalGout<br/>Control<br/>Female",
                 "Ardea<br/>LASSO<br/>Gout<br/>Male",
                 "Ardea<br/>LASSO<br/>Gout<br/>Female",
                 "Ardea<br/>CLEAR1<br/>Gout<br/>Male",
                 "Ardea<br/>CLEAR1<br/>Gout<br/>Female",
                 "Ardea<br/>CLEAR2<br/>Gout<br/>Male",
                 "Ardea<br/>CLEAR2<br/>Gout<br/>Female",
                 "Ardea<br/>CRYSTAL<br/>Gout<br/>Male",
                 "Ardea<br/>CRYSTAL<br/>Gout<br/>Female",
                 "Ardea<br/>LIGHT<br/>Gout<br/>Male",
                 "Ardea<br/>LIGHT<br/>Gout<br/>Female",
                 "East&nbsp;Polynesian<br/>Gout<br/>Male",
                 "East&nbsp;Polynesian<br/>Gout<br/>Female",
                 "East&nbsp;Polynesian<br/>Control<br/>Male",
                 "East&nbsp;Polynesian<br/>Control<br/>Female",
                 "West&nbsp;Polynesian<br/>Gout<br/>Male",
                 "West&nbsp;Polynesian<br/>Gout<br/>Female",
                 "West&nbsp;Polynesian<br/>Control<br/>Male",
                 "West&nbsp;Polynesian<br/>Control<br/>Female")

load(here("Output/UKBB_Gene_OR.RData"))
load(here("Output/Poly_Gene_OR.RData"))

# Model results
load(here("Output/GoutModels.RData"))
load(here("Output/OnsetModels.RData"))
load(here("Output/TophiModels.RData"))
# load(here("Output/FlareModels1.RData"))
# load(here("Output/FlareModels2.RData"))
# Functions 
report <- function(x) {
    if(sum(is.na(x)) != length(x)) {
      paste0(sprintf(mean(x, na.rm = TRUE), fmt = "%#.1f"), " ± ", sprintf(sd(x, na.rm = TRUE), fmt = "%#.1f"))
    } else {
      paste0("NA")
    }
}

report_median <- function(x) {
    if(sum(is.na(x)) != length(x)) {
      paste0(median(x, na.rm =T), " (", summary(x)[[2]], " - ", summary(x)[[5]], ")")
    } else {
      paste0("NA")
    }
}

sumreport <- function(x) {
  if(sum(is.na(x)) != length(x)){
    paste0(sum(x, na.rm = TRUE), " (", sprintf((mean(x, na.rm = TRUE) * 100), fmt = "%#.1f"), ")")
  } else {
      paste0("NA")
  }
}

transpose_df <- function(df) {
  t_df <- data.table::transpose(df)
  colnames(t_df) <- rownames(df)
  rownames(t_df) <- colnames(df)
  t_df <- t_df %>%
    rownames_to_column() %>%
    as_tibble() %>% 
    row_to_names(row_number = 1)
  return(t_df)
}

missing <- function(x){
  if(sum(is.na(x)) == length(x)) {
    return("+")
    } else if(sum(!is.na(x)) == length(x)){
      return("-")
      } else {
      paste0(format(sum(is.na(x)), big.mark = ","), " (", format(round((sum(is.na(x)) / length(x) * 100), digits = 1), nsmall = 1), ")")
  }
}

The following table describes the cohort statistics for every variable that I have deemed to have sufficient non-missing data. If it is missing at more than 50% in a single cohort then that cohort will be set to “too much missing”. If there are no cohorts with fewer than 30% missing (excluding UKBB) then the variable is removed from the plot. This will probably end up as a supplementary table but could be table 1 in the paper.

table1 <- tibble("Cohort" = cohortstring, 
                 "N" = unlist(lapply(data_list, nrow)),
                 "Age at Collection (years)" = unlist(lapply(data_list, function(x) report(x$AGECOL))),
                 "Serum Urate (mg/dL)" = unlist(lapply(data_list, function(x) report(x$URATE))),
                 "ULT" = unlist(lapply(data_list, function(x) sumreport(x$ULT))),
                 "Age at Onset (years)" = unlist(lapply(data_list, function(x) report(x$AGE1ATK))),
                 "Disease Duration (years)" = unlist(lapply(data_list, function(x) report(x$DURATION))),
                 "Number of Flares in Last Year" = unlist(lapply(data_list, function(x) report_median(x$NUMATK))),
                 "Presence of Tophi" = unlist(lapply(data_list, function(x) sumreport(x$TOPHIGOUT))),
                 "PRS - Imputed" = unlist(lapply(data_list, function(x) report(x$PRS1))),
                 "PRS - Direct" = unlist(lapply(data_list, function(x) report(x$PRS2))),
                 "BMI" = unlist(lapply(data_list, function(x) report(x$BMI))),
                 "Type 2 Diabetes" = unlist(lapply(data_list, function(x) sumreport(x$DIABETES))))

table1 <- transpose_df(table1) %>% 
  column_to_rownames(var = "Cohort") %>% 
  mutate(across(.cols = 1:ncol(table1), ~ str_replace_all(string = .x, pattern = " ", replacement = "&nbsp;")))

row.names(table1) <- str_replace_all(row.names(table1), " ", "&nbsp;")

table1 %>% 
  kable(col.names = clean_names,
        align = "c",
        escape = F) %>% 
  kable_styling("striped") %>% 
  scroll_box(width = "900px", height = "475px") %>% 
  footnote("Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).")
UK Biobank
Gout
Male
UK Biobank
Gout
Female
UK Biobank
Control
Male
UK Biobank
Control
Female
Aus/NZ European
Gout
Male
Aus/NZ European
Gout
Female
Aus/NZ European
Control
Male
Aus/NZ European
Control
Female
GlobalGout
Gout
Male
GlobalGout
Gout
Female
GlobalGout
Control
Male
GlobalGout
Control
Female
Ardea
LASSO
Gout
Male
Ardea
LASSO
Gout
Female
Ardea
CLEAR1
Gout
Male
Ardea
CLEAR1
Gout
Female
Ardea
CLEAR2
Gout
Male
Ardea
CLEAR2
Gout
Female
Ardea
CRYSTAL
Gout
Male
Ardea
CRYSTAL
Gout
Female
Ardea
LIGHT
Gout
Male
Ardea
LIGHT
Gout
Female
East Polynesian
Gout
Male
East Polynesian
Gout
Female
East Polynesian
Control
Male
East Polynesian
Control
Female
West Polynesian
Gout
Male
West Polynesian
Gout
Female
West Polynesian
Control
Male
West Polynesian
Control
Female
N 7094 851 149748 181186 928 195 714 593 1017 125 45 76 776 65 223 16 233 8 170 4 102 9 563 161 306 460 458 61 218 212
Age at Collection (years) 60.0 ± 7.0 61.7 ± 6.1 57.0 ± 8.1 56.7 ± 7.9 62.7 ± 12.2 70.7 ± 12.4 55.9 ± 16.6 51.9 ± 17.2 60.1 ± 13.2 67.8 ± 10.8 57.6 ± 16.7 65.2 ± 11.0 51.3 ± 11.8 60.7 ± 10.6 52.4 ± 11.2 61.4 ± 7.4 53.1 ± 10.9 55.2 ± 14.8 54.4 ± 10.8 63.8 ± 5.4 54.1 ± 11.8 64.6 ± 15.1 55.3 ± 12.4 60.3 ± 12.1 45.3 ± 15.6 46.3 ± 15.2 47.5 ± 12.3 52.0 ± 14.4 39.6 ± 15.1 40.4 ± 15.2
Serum Urate (mg/dL) 6.7 ± 1.7 6.1 ± 2.0 5.9 ± 1.2 4.5 ± 1.1 6.7 ± 1.9 6.5 ± 2.4 5.5 ± 2.8 3.1 ± 2.6 7.4 ± 2.3 7.7 ± 2.7 7.0 ± 1.8 6.4 ± 1.7 8.9 ± 1.2 8.9 ± 1.4 7.8 ± 1.4 8.1 ± 1.2 7.9 ± 1.5 8.2 ± 2.0 8.8 ± 1.5 10.1 ± 1.2 9.3 ± 1.7 8.0 ± 1.4 7.0 ± 2.2 6.5 ± 2.5 6.5 ± 1.8 5.4 ± 1.6 7.7 ± 2.1 7.0 ± 2.8 6.7 ± 1.9 5.4 ± 1.7
ULT 4036 (56.9) 331 (38.9) NA NA 534 (99.8) 102 (99.0) NA NA 564 (75.9) 49 (58.3) NA NA 237 (30.7) 25 (39.1) 223 (100.0) 16 (100.0) 233 (100.0) 8 (100.0) 93 (100.0) 1 (100.0) 102 (100.0) 9 (100.0) 380 (92.0) 112 (91.1) NA NA 314 (95.2) 44 (95.7) NA NA
Age at Onset (years) NA NA NA NA 46.6 ± 15.7 60.6 ± 15.1 NA NA 46.4 ± 14.0 57.6 ± 12.5 NA NA 41.3 ± 13.4 55.5 ± 11.4 41.7 ± 12.3 55.2 ± 11.2 42.8 ± 13.3 47.6 ± 19.5 40.2 ± 13.3 61.5 ± 6.2 43.2 ± 13.3 52.0 ± 18.9 37.9 ± 14.2 48.8 ± 15.6 NA NA 34.3 ± 11.9 43.0 ± 15.7 NA NA
Disease Duration (years) NA NA NA NA 17.0 ± 12.8 10.4 ± 9.9 NA NA 14.6 ± 11.5 11.0 ± 10.4 NA NA 11.0 ± 9.5 6.2 ± 7.3 11.6 ± 9.7 7.1 ± 9.5 11.3 ± 10.0 8.6 ± 11.2 15.2 ± 10.3 3.2 ± 1.0 11.9 ± 9.0 13.6 ± 15.4 18.2 ± 13.5 13.2 ± 12.9 NA NA 13.9 ± 10.3 9.2 ± 8.7 NA NA
Number of Flares in Last Year NA NA NA NA 2 (0 - 4) 2 (0 - 4) NA NA 2 (1 - 4) 3 (1.5 - 4) NA NA 4 (3 - 8) 3 (3 - 6) 3 (2 - 6) 3 (3 - 4) 4 (2 - 6) 4.5 (2 - 6) 4 (3 - 6) 4.5 (2.25 - 6) 4 (2 - 9.5) 4 (3 - 5) 3 (1 - 6) 2 (0 - 5.5) NA NA 4 (2 - 10) 3 (1 - 6) NA NA
Presence of Tophi NA NA NA NA 310 (42.8) 64 (41.3) NA NA 314 (58.3) 47 (63.5) NA NA 135 (17.4) 5 (7.7) 33 (14.9) 1 (6.2) 51 (21.9) 3 (37.5) 169 (99.4) 4 (100.0) 26 (25.5) 5 (55.6) 163 (36.3) 30 (24.4) NA NA 192 (45.6) 16 (29.1) NA NA
PRS - Imputed 5.6 ± 0.7 5.5 ± 0.7 5.1 ± 0.7 5.1 ± 0.7 5.6 ± 0.7 5.5 ± 0.7 5.2 ± 0.7 5.2 ± 0.7 5.6 ± 0.7 5.5 ± 0.6 5.5 ± 0.7 5.3 ± 0.7 5.7 ± 0.7 5.6 ± 0.7 5.7 ± 0.7 5.8 ± 0.7 5.8 ± 0.6 6.0 ± 0.7 5.8 ± 0.6 5.7 ± 0.4 5.7 ± 0.7 5.8 ± 0.3 NA NA NA NA NA NA NA NA
PRS - Direct 4.2 ± 0.6 4.1 ± 0.6 3.8 ± 0.6 3.8 ± 0.6 4.2 ± 0.6 4.2 ± 0.6 3.9 ± 0.6 3.8 ± 0.6 4.2 ± 0.6 4.1 ± 0.6 4.1 ± 0.7 3.9 ± 0.6 4.3 ± 0.7 4.3 ± 0.6 4.4 ± 0.6 4.4 ± 0.6 4.4 ± 0.6 4.7 ± 0.8 4.4 ± 0.6 4.3 ± 0.5 4.3 ± 0.6 4.5 ± 0.3 4.5 ± 0.5 4.5 ± 0.5 4.4 ± 0.4 4.3 ± 0.5 5.0 ± 0.6 4.9 ± 0.6 4.5 ± 0.6 4.5 ± 0.5
BMI 30.4 ± 4.7 32.2 ± 6.6 27.8 ± 4.2 27.0 ± 5.1 30.2 ± 5.3 30.6 ± 7.3 27.0 ± 4.4 26.8 ± 6.1 29.4 ± 4.7 31.1 ± 6.8 NA NA 34.3 ± 6.7 38.2 ± 10.4 34.3 ± 6.3 38.1 ± 6.5 33.8 ± 6.1 36.2 ± 6.9 32.1 ± 5.4 36.5 ± 3.8 31.1 ± 4.9 35.7 ± 8.1 35.6 ± 7.9 38.3 ± 9.1 31.9 ± 7.0 32.5 ± 8.6 36.0 ± 6.7 38.8 ± 9.1 33.2 ± 6.2 34.3 ± 7.6
Type 2 Diabetes 1275 (20.0) 186 (24.4) 11687 (8.8) 7637 (4.8) 137 (15.9) 50 (27.9) 49 (11.9) 42 (13.4) 343 (41.3) 53 (55.8) NA NA 74 (9.5) 16 (24.6) 30 (13.5) 6 (37.5) 32 (13.7) 1 (12.5) 22 (12.9) 2 (50.0) 12 (11.8) 0 (0.0) 170 (36.9) 74 (53.6) 66 (25.6) 84 (21.2) 85 (19.1) 29 (47.5) 37 (17.5) 53 (26.2)
Note: Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).
# Trying to make datatable version of this

datatable(data = table1,
          colnames = clean_names, 
          escape = F,
          caption = "Flares reported as median (inter-quartile range). All other numeric variables reported as mean ± sd. All binary variables reported as N (%).") # still can't easily freeze rownames


Now, these are two plots of the PRS distribution (for imputed and genotyped respectively). These can be supplementary figures.

# PRS - imputed
all_pheno_prs %>% 
  filter(!str_detect(COHORT2, "Polynesian")) %>% 
  ggplot(aes(x = PRS1, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Gout PRS - Imputed") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_y_discrete(limits = rev(levels(all_pheno_prs %>% filter(!str_detect(COHORT2, "Polynesian")) %>% mutate(COHORT2 = factor(COHORT2)) %>% pull(COHORT2)))) +
    scale_color_discrete(type = c("#1e6b52", "#aa9767"))

# PRS - direct
all_pheno_prs %>% 
  ggplot(aes(x = PRS2, y = COHORT2, color = SEX)) +
    geom_boxplot(position = position_dodge2(reverse = T)) +
    labs(x = "Gout PRS - Directly Genotyped") +
    theme(axis.title.y = element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.minor.y = element_blank(),
          legend.title = element_blank()) + 
    scale_y_discrete(limits = rev(levels(all_pheno_prs$COHORT2))) +
    scale_color_discrete(type = c("#1e6b52", "#aa9767"))


These next two plots are the manhattan plots for the gout GWAS showing the locations of the SNPs that were used in either the imputed PRS or the genotyped PRS.

# Manhattan plot of UKBB gout GWAS
# Preparing data
if(file.exists(here("Output/Temp/ManhattanData.RData"))){
  load(here("Output/Temp/ManhattanData.RData"))
} else {
  load(here("Output/Temp/biallelic_sumstat_final.RData"))

  GOUT_pValues <- biallelic_sumstat_final %>% 
    filter(P < 0.01) %>% 
    arrange(CHR, BP)
  
  tmp <- GOUT_pValues %>% 
    filter(RSID %in% UKBB_Gene_OR$RSID) %>% 
    mutate("Gene" = UKBB_Gene_OR$Locus_Name)
  
  tmp2 <- GOUT_pValues %>% 
    filter(!(SNP %in% tmp$SNP)) %>% 
    mutate("Gene" = NA)
  
  GOUT_pValues <- full_join(tmp, tmp2) %>% 
    arrange(CHR, BP)
  
  GOUT_pValues2 <- GOUT_pValues %>% 
    
    # Compute chromosome size
    group_by(CHR) %>% 
    summarise(chr_len = max(BP)) %>% 
    
    # Calculate cumulative position of each chromosome
    mutate(tot = cumsum(chr_len) - chr_len) %>%
    select(-chr_len) %>%
    
    # Add this info to the initial dataset
    left_join(GOUT_pValues, ., by = "CHR") %>%
    
    # Add a cumulative position of each SNP
    arrange(CHR, BP) %>%
    mutate(BPcum = BP + tot) %>%
    
    # Add highlight and annotation information
    mutate(is_highlight = ifelse(RSID %in% UKBB_Gene_OR$RSID, "yes", "no")) %>%
    mutate(is_annotate = ifelse(RSID %in% UKBB_Gene_OR$RSID, "yes", "no")) 
  
  save(GOUT_pValues2, file = here("Output/Temp/ManhattanData.RData"))
}




axisdf <- GOUT_pValues2 %>% 
  group_by(CHR) %>% 
  summarize(center = (max(BPcum) + min(BPcum)) / 2)



ggplot(GOUT_pValues2, aes(x = BPcum, y = -log10(P))) +
  
  # Show all points
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("#1e6b52", "#aa9767"), 22)) +
  
  geom_hline(yintercept = -log10(5e-8), colour = "red") +
  
  # custom X axis:
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 250)) +     # remove space between plot area and x axis
  
  # Add highlighted points
  geom_point(data = subset(GOUT_pValues2, is_highlight == "yes"), color = "orange", size = 2) +
  
  # Add label using ggrepel to avoid overlapping
  geom_label_repel(aes(label = Gene), size = 3, box.padding = 0.5, force_pull = 0.5, nudge_y = 3, max.overlaps = Inf) +
  
  xlab("Chromosome") +
  
  ggtitle("UK Biobank Gout GWAS Results - Imputed SNPs") +
  
  # Custom the theme:
  theme_bw() +
  theme( 
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# Now same for directly genotyped SNPs only
if(file.exists(here("Output/Temp/ManhattanData1.RData"))){
  load(here("Output/Temp/ManhattanData1.RData"))
} else {
  load(here("Output/Temp/biallelic_sumstat_final.RData"))

  direct_snps <- read_delim("/Volumes/archive/merrimanlab/raid_backup/New_Zealand_Chip_data/CoreExome/QC_MergedBatches/Final_Data/CZ-MB1.2-QC1.10_CoreExome24-1.0-3_genotyped-QCd_rsIDconverted.bim", delim = "\t", col_names = F) %>% 
  mutate(CHR_BP = paste0(X1, "_", X4))

  GOUT_pValues <- biallelic_sumstat_final %>% 
    mutate(CHR_BP = paste0(CHR, "_", BP)) %>% 
    filter(P < 0.01,
           CHR_BP %in% direct_snps$CHR_BP) %>% 
    arrange(CHR, BP)
  
  tmp <- GOUT_pValues %>% 
    filter(RSID %in% Poly_Gene_OR$RSID) %>% 
    mutate("Gene" = Poly_Gene_OR$Locus_Name)
  
  tmp2 <- GOUT_pValues %>% 
    filter(!(SNP %in% tmp$SNP)) %>% 
    mutate("Gene" = NA)
  
  GOUT_pValues <- full_join(tmp, tmp2) %>% 
    arrange(CHR, BP)
  
  GOUT_pValues2 <- GOUT_pValues %>% 
    
    # Compute chromosome size
    group_by(CHR) %>% 
    summarise(chr_len = max(BP)) %>% 
    
    # Calculate cumulative position of each chromosome
    mutate(tot = cumsum(chr_len) - chr_len) %>%
    select(-chr_len) %>%
    
    # Add this info to the initial dataset
    left_join(GOUT_pValues, ., by = "CHR") %>%
    
    # Add a cumulative position of each SNP
    arrange(CHR, BP) %>%
    mutate(BPcum = BP + tot) %>%
    
    # Add highlight and annotation information
    mutate(is_highlight = ifelse(RSID %in% Poly_Gene_OR$RSID, "yes", "no")) %>%
    mutate(is_annotate = ifelse(RSID %in% Poly_Gene_OR$RSID, "yes", "no"))  
  
  save(GOUT_pValues2, file = here("Output/Temp/ManhattanData1.RData"))
}

axisdf <- GOUT_pValues2 %>% 
  group_by(CHR) %>% 
  summarize(center = (max(BPcum) + min(BPcum)) / 2)

ggplot(GOUT_pValues2, aes(x = BPcum, y = -log10(P))) +
  
  # Show all points
  geom_point(aes(color = as.factor(CHR)), alpha = 0.8, size = 1.3) +
  scale_color_manual(values = rep(c("#1e6b52", "#aa9767"), 22)) +
  
  geom_hline(yintercept = -log10(5e-8), colour = "red") +
  
  # custom X axis:
  scale_x_continuous(label = axisdf$CHR, breaks = axisdf$center) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 250)) +     # remove space between plot area and x axis
  
  # Add highlighted points
  geom_point(data = subset(GOUT_pValues2, is_highlight == "yes"), color = "orange", size = 2) +
  
  # Add label using ggrepel to avoid overlapping
  geom_label_repel(aes(label = Gene), size = 3, box.padding = 0.5, force_pull = 0.5, nudge_y = 3, max.overlaps = Inf) +
  
  xlab("Chromosome") +
  
  ggtitle("UK Biobank Gout GWAS Results - Directly Genotyped SNPs") +
  
  # Custom the theme:
  theme_bw() +
  theme( 
    legend.position = "none",
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

rm(axisdf, GOUT_pValues2)


The below forest plots show the main effects of the two different PRS on gout in each respective model.

tmp <- GoutModels %>% 
  filter(Predictor == "PRS1",
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Cohort, "GlobalGout")) %>% 
  separate(Cohort, into = c("Cohort", "Sex"), sep = " - ")

gout <- metagen(TE = `log-odds`, 
                seTE = SE, 
                studlab = Cohort,
                byvar = Sex,
                data = tmp, 
                sm = "OR")

tmp <- GoutModels %>% 
  filter(Predictor == "PRS1",
         str_detect(Covariates, "AGECOL"),
         !str_detect(Cohort, "GlobalGout")) %>% 
  separate(Cohort, into = c("Cohort", "Sex"), sep = " - ")

gout_adj <- metagen(TE = `log-odds`, 
                    seTE = SE, 
                    studlab = Cohort,
                    byvar = Sex,
                    data = tmp, 
                    sm = "OR")

forest(gout, 
       xlim = c(1, 3),
       at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Effect on Gout per unit Imputed PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

forest(gout_adj, 
       xlim = c(1, 3),
       at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Effect on Gout per unit Imputed PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

tmp <- GoutModels %>% 
  filter(Predictor == "PRS2",
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Cohort, "GlobalGout")) %>% 
  separate(Cohort, into = c("Cohort", "Sex"), sep = " - ") %>% 
  mutate(Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                           !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                           str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
                           str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))

gout <- metagen(TE = `log-odds`, 
                seTE = SE, 
                studlab = Cohort,
                byvar = Label,
                data = tmp, 
                sm = "OR")

tmp <- GoutModels %>% 
  filter(Predictor == "PRS2",
         str_detect(Covariates, "AGECOL"),
         !str_detect(Cohort, "GlobalGout")) %>% 
  separate(Cohort, into = c("Cohort", "Sex"), sep = " - ") %>% 
  mutate(Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                           !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                           str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
                           str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))

gout_adj <- metagen(TE = `log-odds`, 
                    seTE = SE, 
                    studlab = Cohort,
                    byvar = Label,
                    data = tmp, 
                    sm = "OR")

forest(gout, 
       xlim = c(1, 8),
       at = c(1.0, 3.0, 6.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Effect on Gout per unit Genotyped PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

forest(gout_adj, 
       xlim = c(1, 8),
       at = c(1.0, 3.0, 6.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Effect on Gout per unit Genotyped PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)


The following forest plots are for the effect of both PRS on either age at onset or tophi, with tophi models additionally adjusted for duration.

# Age at Onset
tmp <- OnsetModels %>% 
  filter(Predictor == "PRS1",
         N > 20) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         Beta = case_when(is.na(LCL) ~ NA_real_,
                          TRUE ~ Beta))

onset <- metagen(TE = Beta, 
                 seTE = SE, 
                 studlab = Cohort,
                 byvar = Sex,
                 data = tmp)

forest(onset, 
       xlim = c(-15, 5),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Δ Age at Onset per unit Imputed PRS", "N"),
       rightlabs = c("Beta (years)", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

tmp <- OnsetModels %>% 
  filter(Predictor == "PRS2") %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         Beta = case_when(is.na(LCL) ~ NA_real_,
                          TRUE ~ Beta),
         Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                           !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                           str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
                           str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))

onset <- metagen(TE = Beta, 
                 seTE = SE, 
                 studlab = Cohort,
                 byvar = Label,
                 data = tmp)

#pdf(file = here("Output/test.pdf"), width = 8, height = 8)

forest(onset, 
       xlim = c(-15, 5),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Δ Age at Onset per unit Genotyped PRS", "N"),
       rightlabs = c("Beta (years)", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

#dev.off()


# Tophi
tmp <- TophiModels %>% 
  filter(Predictor == "PRS1",
         !str_detect(Covariates, "DURATION")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         `log-odds` = case_when(LCL == 0 ~ NA_real_,
                        TRUE ~ `log-odds`))

tophi <- metagen(TE = `log-odds`, 
                 seTE = SE, 
                 studlab = Cohort,
                 byvar = Sex,
                 data = tmp, 
                 sm = "OR")

tmp <- TophiModels %>% 
  filter(Predictor == "PRS1",
         str_detect(Covariates, "DURATION")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         `log-odds` = case_when(LCL == 0 ~ NA_real_,
                        TRUE ~ `log-odds`))

tophi_adj <- metagen(TE = `log-odds`, 
                     seTE = SE, 
                     studlab = Cohort,
                     byvar = Sex,
                     data = tmp, 
                     sm = "OR")

forest(tophi, 
       xlim = c(0.3, 3),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Tophi per unit Imputed PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

forest(tophi_adj, 
       xlim = c(0.3, 3),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Tophi per unit Imputed PRS + Disease Duration", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

tmp <- TophiModels %>% 
  filter(Predictor == "PRS2",
         !str_detect(Covariates, "DURATION")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         `log-odds` = case_when(LCL == 0 ~ NA_real_,
                        TRUE ~ `log-odds`),
         Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                           !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                           str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
                           str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))

tophi <- metagen(TE = `log-odds`, 
                 seTE = SE, 
                 studlab = Cohort,
                 byvar = Label,
                 data = tmp, 
                 sm = "OR")

tmp <- TophiModels %>% 
  filter(Predictor == "PRS2",
         str_detect(Covariates, "DURATION")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         `log-odds` = case_when(LCL == 0 ~ NA_real_,
                        TRUE ~ `log-odds`),
         Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
                           !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
                           str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
                           str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))

tophi_adj <- metagen(TE = `log-odds`, 
                     seTE = SE, 
                     studlab = Cohort,
                     byvar = Label,
                     data = tmp, 
                     sm = "OR")

forest(tophi, 
       xlim = c(0.3, 3),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Tophi per unit Genotyped PRS", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

forest(tophi_adj, 
       xlim = c(0.3, 3),
       #at = c(1.0, 2.0, 3.0), 
       print.tau2 = F,
       print.byvar = F, 
       col.square.lines = "black", 
       col.diamond.lines = "black",
       col.diamond = "#1e6b52",
       col.square = "#aa9767",
       col.study = "gray60",
       col.by = "black",
       leftcols = c("studlab", "N case", "N control"),
       rightcols = c("effect", "ci"),
       leftlabs = c("Tophi per unit Genotyped PRS + Disease Duration", "N case", "N control"),
       rightlabs = c("", "[95% CI]"),
       addrows.below.overall = 2, 
       comb.random = F,
       overall = F)

# Flares
# tmp <- FlareModels1 %>% 
#   filter(Predictor == "PRS1") %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"),
#          `log-odds` = case_when(LCL == 0 ~ NA_real_,
#                         TRUE ~ `log-odds`))
# 
# flares <- metagen(TE = `log-odds`, 
#                  seTE = SE, 
#                  studlab = Cohort,
#                  byvar = Sex,
#                  data = tmp, 
#                  sm = "OR")
# 
# forest(flares, 
#        #xlim = c(0.3, 3),
#        #at = c(1.0, 2.0, 3.0), 
#        print.tau2 = F,
#        print.byvar = F, 
#        col.square.lines = "black", 
#        col.diamond.lines = "black",
#        col.diamond = "#1e6b52",
#        col.square = "#aa9767",
#        col.study = "gray60",
#        col.by = "black",
#        leftcols = c("studlab", "N"),
#        rightcols = c("effect", "ci"),
#        leftlabs = c("Flare category per unit Imputed PRS", "N"),
#        rightlabs = c("", "[95% CI]"),
#        addrows.below.overall = 2, 
#        comb.random = F,
#        overall = F)
# 
# 
# 
# tmp <- FlareModels1 %>% 
#   filter(Predictor == "PRS2") %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"),
#          `log-odds` = case_when(LCL == 0 ~ NA_real_,
#                         TRUE ~ `log-odds`),
#          Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
#                            !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
#                            str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
#                            str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))
# 
# flares <- metagen(TE = `log-odds`, 
#                  seTE = SE, 
#                  studlab = Cohort,
#                  byvar = Label,
#                  data = tmp, 
#                  sm = "OR")
# 
# forest(flares, 
#        #xlim = c(0.3, 3),
#        #at = c(1.0, 2.0, 3.0), 
#        print.tau2 = F,
#        print.byvar = F, 
#        col.square.lines = "black", 
#        col.diamond.lines = "black",
#        col.diamond = "#1e6b52",
#        col.square = "#aa9767",
#        col.study = "gray60",
#        col.by = "black",
#        leftcols = c("studlab", "N"),
#        rightcols = c("effect", "ci"),
#        leftlabs = c("Flares per unit Genotyped PRS", "N"),
#        rightlabs = c("", "[95% CI]"),
#        addrows.below.overall = 2, 
#        comb.random = F,
#        overall = F)
# 
# 
# 
# # Flares2
# tmp <- FlareModels2 %>% 
#   filter(Predictor == "PRS1") %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"),
#          `log-odds` = case_when(LCL == 0 ~ NA_real_,
#                         TRUE ~ `log-odds`))
# 
# flares <- metagen(TE = `log-odds`, 
#                  seTE = SE, 
#                  studlab = Cohort,
#                  byvar = Sex,
#                  data = tmp, 
#                  sm = "OR")
# 
# forest(flares, 
#        #xlim = c(0.3, 3),
#        #at = c(1.0, 2.0, 3.0), 
#        print.tau2 = F,
#        print.byvar = F, 
#        col.square.lines = "black", 
#        col.diamond.lines = "black",
#        col.diamond = "#1e6b52",
#        col.square = "#aa9767",
#        col.study = "gray60",
#        col.by = "black",
#        leftcols = c("studlab", "N"),
#        rightcols = c("effect", "ci"),
#        leftlabs = c("Flare category per unit Imputed PRS", "N"),
#        rightlabs = c("", "[95% CI]"),
#        addrows.below.overall = 2, 
#        comb.random = F,
#        overall = F)
# 
# 
# 
# tmp <- FlareModels2 %>% 
#   filter(Predictor == "PRS2") %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"),
#          `log-odds` = case_when(LCL == 0 ~ NA_real_,
#                         TRUE ~ `log-odds`),
#          Label = case_when(!str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "European Male",
#                            !str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "European Female",
#                            str_detect(Cohort, "Polynesian") & Sex == "Male" ~ "Polynesian Male",
#                            str_detect(Cohort, "Polynesian") & Sex == "Female" ~ "Polynesian Female"))
# 
# flares <- metagen(TE = `log-odds`, 
#                  seTE = SE, 
#                  studlab = Cohort,
#                  byvar = Label,
#                  data = tmp, 
#                  sm = "OR")
# 
# forest(flares, 
#        #xlim = c(0.3, 3),
#        #at = c(1.0, 2.0, 3.0), 
#        print.tau2 = F,
#        print.byvar = F, 
#        col.square.lines = "black", 
#        col.diamond.lines = "black",
#        col.diamond = "#1e6b52",
#        col.square = "#aa9767",
#        col.study = "gray60",
#        col.by = "black",
#        leftcols = c("studlab", "N"),
#        rightcols = c("effect", "ci"),
#        leftlabs = c("Flares per unit Genotyped PRS", "N"),
#        rightlabs = c("", "[95% CI]"),
#        addrows.below.overall = 2, 
#        comb.random = F,
#        overall = F)


These plots now aim to display the variants that were individually significant after multiple testing correction for each of the PRS models that were significant at the 0.05 level. Where appropriate, these were tested via meta-analysis to reflect the results from the full PRS models.

# The below list indicates which overall PRS models were significant, to inform which individual variants should be tested

# Gout vs Imputed PRS (unadj and adj for AGECOL) = European Males and European Females
# 
# Gout vs Genotyped PRS (unadj and adj for AGECOL) = European Males, European Females, East Poly Males, West Poly Males, East Poly Females, and West Poly Females
# 
# Onset vs Imputed PRS (unadj) = European Males only
# 
# Onset vs Genotyped PRS (unadj) = European Males, East Polynesian Males, and West Polynesian Males
# 
# Tophi vs Imputed PRS (unadj) = European Males only
# 
# Tophi vs Imputed PRS (adj for DURATION) = European Males only (borderline)
# 
# Tophi vs Genotyped PRS (unadj) = European Males, East Polynesian Males, and West Polynesian Males
# 
# Tophi vs Genotyped PRS (adj for DURATION) = East Polynesian Males only
# 
# Flares (all categories) vs Imputed PRS (unadj) = All Non-Sig
# 
# Flares (all cat) vs Genotyped PRS (unadj) = Fixed Poly Male (but not individually)
# 
# Flares (> 2 categories) vs Imputed PRS (unadj) = All Non-Sig
# 
# Flares (> 2 cat) vs Genotyped PRS (unadj) = West Poly Male and Fixed Poly Male

# Preparing data
tmp <- UKBB_Gene_OR %>% 
  select(CHR, BP, Locus_Name, RSID) %>% 
  mutate(Label = paste0(Locus_Name, " (", RSID, ")"))

tmp2 <- Poly_Gene_OR %>% 
  select(CHR, BP, Locus_Name, RSID) %>% 
  mutate(Label = paste0(Locus_Name, " (", RSID, ")"))

tmp <- rbind(tmp, tmp2) %>% 
  unique() %>% 
  select(-Locus_Name) %>% 
  arrange(CHR, BP)

tmp <- rbind(list("CHR" = NA_real_, "BP" = NA_real_, "RSID" = "PRS1", "Label" = "Imputed PRS"), 
             list("CHR" = NA_real_, "BP" = NA_real_, "RSID" = "PRS2", "Label" = "Genotyped PRS")) %>% 
  rbind(tmp) %>% 
  mutate(CHR = as.numeric(CHR),
         BP = as.numeric(BP),
         RSID = as.character(RSID),
         Label = factor(Label, levels = c("Imputed PRS", "Genotyped PRS", tmp$Label))) %>% 
  select(RSID, Label)

GoutModels2 <- GoutModels %>% 
  mutate(Predictor = as.character(Predictor)) %>% 
  left_join(tmp, by = c("Predictor" = "RSID")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"),
         Cohort = factor(Cohort, levels = c("West Polynesian", "East Polynesian", "GlobalGout", "Aus/NZ European", "UK Biobank")))

OnsetModels2 <- OnsetModels %>% 
  mutate(Predictor = as.character(Predictor)) %>% 
  left_join(tmp, by = c("Predictor" = "RSID")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"))

TophiModels2 <- TophiModels %>% 
  mutate(Predictor = as.character(Predictor)) %>% 
  left_join(tmp, by = c("Predictor" = "RSID")) %>% 
  mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
                         str_detect(Cohort, "Female") ~ "Female"),
         Cohort = str_remove(Cohort, " - Male| - Female"))

# FlareModels3 <- FlareModels1 %>% 
#   mutate(Predictor = as.character(Predictor)) %>% 
#   left_join(tmp, by = c("Predictor" = "RSID")) %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"))
# 
# FlareModels4 <- FlareModels2 %>% 
#   mutate(Predictor = as.character(Predictor)) %>% 
#   left_join(tmp, by = c("Predictor" = "RSID")) %>% 
#   mutate(Sex = case_when(str_detect(Cohort, "Male") ~ "Male",
#                          str_detect(Cohort, "Female") ~ "Female"),
#          Cohort = str_remove(Cohort, " - Male| - Female"))



# Imputed PRS Gout models ------------------------------------------------
# Unadjusted
tmp <- GoutModels %>%
  filter(Predictor %in% UKBB_Gene_OR$RSID,
         str_detect(Cohort, "Aus"),
         str_detect(Cohort, "Male"),
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(UKBB_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         Sex == "Male",
         !str_detect(Cohort, "Polynesian|GlobalGout"),
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Imputed Variants (Males Only) - Unadjusted",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

tmp <- GoutModels %>%
  filter(Predictor %in% UKBB_Gene_OR$RSID,
         str_detect(Cohort, "Aus"),
         !str_detect(Cohort, "Male"),
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(UKBB_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         Sex == "Female",
         !str_detect(Cohort, "Polynesian|GlobalGout"),
         !str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Imputed Variants (Females Only) - Unadjusted",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

# Adjusted
tmp <- GoutModels %>%
  filter(Predictor %in% UKBB_Gene_OR$RSID,
         str_detect(Cohort, "Aus"),
         str_detect(Cohort, "Male"),
         str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(UKBB_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         Sex == "Male",
         !str_detect(Cohort, "Polynesian|GlobalGout"),
         str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Imputed Variants (Males Only) - Adjusted for Age at Collection",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

tmp <- GoutModels %>%
  filter(Predictor %in% UKBB_Gene_OR$RSID,
         str_detect(Cohort, "Aus"),
         !str_detect(Cohort, "Male"),
         str_detect(Covariates, "AGECOL"),
         !str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(UKBB_Gene_OR)) # empty



# Genotyped PRS gout models ---------------------------------------------
# Unadjusted
tmp <- GoutModels %>%
  filter(Predictor %in% Poly_Gene_OR$RSID,
         !str_detect(Cohort, "UK|Global"),
         str_detect(Cohort, "Male"),
         !str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(Poly_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         !str_detect(Cohort, "Global"),
         Sex == "Male",
         !str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Genotyped Variants (Males Only) - Unadjusted",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

tmp <- GoutModels %>%
  filter(Predictor %in% Poly_Gene_OR$RSID,
         !str_detect(Cohort, "UK|Global"),
         !str_detect(Cohort, "Male"),
         !str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(Poly_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         !str_detect(Cohort, "Global"),
         Sex == "Female",
         !str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Genotyped Variants (Females Only) - Unadjusted",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

# Adjusted
tmp <- GoutModels %>%
  filter(Predictor %in% Poly_Gene_OR$RSID,
         !str_detect(Cohort, "UK|Global"),
         str_detect(Cohort, "Male"),
         str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(Poly_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         !str_detect(Cohort, "Global"),
         Sex == "Male",
         str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845")) %>% 
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Genotyped Variants (Males Only) - Adjusted for Age at Collection",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

tmp <- GoutModels %>%
  filter(Predictor %in% Poly_Gene_OR$RSID,
         !str_detect(Cohort, "UK|Global"),
         !str_detect(Cohort, "Male"),
         str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845"),
         Pval < 0.05 / nrow(Poly_Gene_OR))

GoutModels2 %>%
  filter(Predictor %in% tmp$Predictor,
         !str_detect(Cohort, "Global"),
         Sex == "Female",
         str_detect(Covariates, "AGECOL"),
         str_detect(Predictors, "rs10910845")) %>%
  ggplot(aes(x = OR, y = Label, color = Cohort)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), position = position_dodge(width = 0.5)) +
  labs(x = "Odds Ratio for Gout per Allele (95% CI)",
       title = "Gout vs Genotyped Variants (Females Only) - Adjusted for Age at Collection",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 1, color = "black") +
  scale_x_continuous(trans = "log2") +
  scale_y_discrete(limits = rev) +
  scale_color_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank())

# Onset models ------------------------------------------------------------
# Imputed PRS SNPs in European Males
snps <- UKBB_Gene_OR$RSID

out <- data.frame("SNP" = "NA", "Beta" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
  slice(-1)
for(i in seq_along(snps)){
  tmp <- OnsetModels2 %>% 
    filter(Sex == "Male",
           !str_detect(Cohort, "Polynesian"),
           !str_detect(Predictors, "rs10910845"),
           Predictor == snps[i])
  
  onset <- metagen(TE = Beta, 
                   seTE = SE, 
                   studlab = Cohort,
                   data = tmp)
  
  out <- rbind(out, list("SNP" = snps[i], "Beta" = onset$TE.fixed, "LCL" = onset$lower.fixed, "UCL" = onset$upper.fixed, "P" = onset$pval.fixed))
}

tmp <- OnsetModels2 %>% 
  select(Predictor, Label)

out %>% 
  filter(P < 0.05 / length(snps)) %>% 
  left_join(tmp, by = c("SNP" = "Predictor")) %>% 
  ggplot(aes(x = Beta, y = Label)) +
  geom_pointrange(aes(xmin = LCL, xmax = UCL), color = "darkgreen") +
  labs(x = "Change in Age at Onset (years) per Allele (95% CI)",
       title = "Onset vs Imputed Variants (Males Only)",
       subtitle = "Showing Variants with Significance after Bonferroni Correction") +
  geom_vline(xintercept = 0, color = "black") +
  scale_y_discrete(limits = rev) +
  theme(axis.title.y = element_blank(),
        legend.title = element_blank(),
        plot.title.position = "plot")

# Genotyped SNPs in all three male cohorts
snps <- Poly_Gene_OR$RSID

out <- data.frame("SNP" = "NA", "Beta" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
  slice(-1)
for(i in seq_along(snps)){
  tmp <- OnsetModels2 %>% 
    filter(Sex == "Male",
           !str_detect(Cohort, "Polynesian"),
           str_detect(Predictors, "rs10910845"),
           Predictor == snps[i])
  
  onset <- metagen(TE = Beta, 
                   seTE = SE, 
                   studlab = Cohort,
                   data = tmp)
  
  out <- rbind(out, list("SNP" = snps[i], "Beta" = onset$TE.fixed, "LCL" = onset$lower.fixed, "UCL" = onset$upper.fixed, "P" = onset$pval.fixed))
}

test <- out %>% 
  filter(P < 0.05 / length(snps)) %>% 
  pull(SNP)

test2 <- OnsetModels2 %>% 
  filter(Sex == "Male",
         str_detect(Cohort, "Polynesian"),
         str_detect(Predictors, "rs10910845"),
         Predictor %in% snps,
         Pval < 0.05 / length(snps)) %>% 
  pull(Predictor)

snplist <- c(test, test2) %>%
  unique()

OnsetModels3 <- OnsetModels2 %>% 
  mutate(Group = case_when(str_detect(Cohort, "Polynesian") ~ "Polynesian",
                           TRUE ~ "European"))

for(i in snplist){
  tmp <- OnsetModels3 %>% 
           filter(Sex == "Male",
                  str_detect(Predictors, "rs10910845"), 
                  Predictor == i)
  
  assign(i, metagen(TE = Beta, 
                             seTE = SE, 
                             studlab = Cohort,
                             byvar = Group,
                             data = tmp))
  
  forest(get(i), 
         xlim = c(-15, 5),
         #at = c(1.0, 2.0, 3.0), 
         print.tau2 = F,
         print.byvar = F, 
         col.square.lines = "black", 
         col.diamond.lines = "black",
         col.diamond = "#1e6b52",
         col.square = "#aa9767",
         col.study = "gray60",
         col.by = "black",
         leftcols = c("studlab", "N"),
         rightcols = c("effect", "ci"),
         leftlabs = c(paste0("Δ Age at Onset per ", i, " allele"), "N"),
         rightlabs = c("Beta (years)", "[95% CI]"),
         addrows.below.overall = 2, 
         comb.random = F,
         overall = F)
}

# Tophi models -----------------------------------------------------------
# Tophi vs Imputed PRS (unadj) = European Males only
snps <- UKBB_Gene_OR$RSID

out <- data.frame("SNP" = "NA", "OR" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
  slice(-1)
for(i in seq_along(snps)){
  tmp <- TophiModels2 %>% 
    filter(Sex == "Male",
           !str_detect(Cohort, "Polynesian"),
           !str_detect(Covariates, "DURATION"),
           !str_detect(Predictors, "rs10910845"),
           LCL != 0,
           Predictor == snps[i])
  
  tophi <- metagen(TE = `log-odds`, 
                   seTE = SE, 
                   studlab = Cohort,
                   data = tmp,
                   sm = "OR")
  
  out <- rbind(out, list("SNP" = snps[i], "OR" = exp(tophi$TE.fixed), "LCL" = exp(tophi$lower.fixed), "UCL" = exp(tophi$upper.fixed), "P" = tophi$pval.fixed))
}

out %>% 
  filter(P < 0.05 / length(snps)) %>% 
  nrow()
## [1] 0
# Tophi vs Imputed PRS (adj for DURATION) = European Males only (borderline)
out <- data.frame("SNP" = "NA", "OR" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
  slice(-1)
for(i in seq_along(snps)){
  tmp <- TophiModels2 %>% 
    filter(Sex == "Male",
           !str_detect(Cohort, "Polynesian"),
           str_detect(Covariates, "DURATION"),
           !str_detect(Predictors, "rs10910845"),
           LCL != 0,
           Predictor == snps[i])
  
  tophi <- metagen(TE = `log-odds`, 
                   seTE = SE, 
                   studlab = Cohort,
                   data = tmp,
                   sm = "OR")
  
  out <- rbind(out, list("SNP" = snps[i], "OR" = exp(tophi$TE.fixed), "LCL" = exp(tophi$lower.fixed), "UCL" = exp(tophi$upper.fixed), "P" = tophi$pval.fixed))
}

out %>% 
  filter(P < 0.05 / length(snps)) %>% 
  nrow() 
## [1] 0
# Tophi vs Genotyped PRS (unadj) = European Males, East Polynesian Males, and West Polynesian Males
snps <- Poly_Gene_OR$RSID

out <- data.frame("SNP" = "NA", "OR" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
  slice(-1)
for(i in seq_along(snps)){
  tmp <- TophiModels2 %>% 
    filter(Sex == "Male",
           #!str_detect(Cohort, "Polynesian"),
           !str_detect(Covariates, "DURATION"),
           str_detect(Predictors, "rs10910845"),
           LCL != 0,
           Predictor == snps[i])
  
  tophi <- metagen(TE = `log-odds`, 
                   seTE = SE, 
                   studlab = Cohort,
                   data = tmp,
                   sm = "OR")
  
  out <- rbind(out, list("SNP" = snps[i], "OR" = exp(tophi$TE.fixed), "LCL" = exp(tophi$lower.fixed), "UCL" = exp(tophi$upper.fixed), "P" = tophi$pval.fixed))
}

out %>% 
  filter(P < 0.05 / length(snps)) %>% 
  nrow()
## [1] 0
# Tophi vs Genotyped PRS (adj for DURATION) = East Polynesian Males only
tmp <- TophiModels2 %>% 
  filter(Sex == "Male",
         str_detect(Cohort, "East Polynesian"),
         str_detect(Covariates, "DURATION"),
         str_detect(Predictors, "rs10910845"),
         LCL != 0,
         Predictor %in% snps,
         Pval < 0.05 / length(snps))
 
# Flares (all cat) vs Genotyped PRS (unadj) = Fixed Poly Male (but not individually)
# snps <- Poly_Gene_OR$RSID
# 
# out <- data.frame("SNP" = "NA", "OR" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
#   slice(-1)
# for(i in seq_along(snps)){
#   tmp <- FlareModels3 %>% 
#     filter(Sex == "Male",
#            str_detect(Cohort, "Polynesian"),
#            LCL != 0,
#            Predictor == snps[i])
#   
#   tophi <- metagen(TE = `log-odds`, 
#                    seTE = SE, 
#                    studlab = Cohort,
#                    data = tmp,
#                    sm = "OR")
#   
#   out <- rbind(out, list("SNP" = snps[i], "OR" = exp(tophi$TE.fixed), "LCL" = exp(tophi$lower.fixed), "UCL" = exp(tophi$upper.fixed), "P" = tophi$pval.fixed))
# }
# 
# out %>% 
#   filter(P < 0.05 / length(snps)) %>% 
#   nrow()
# 
# 
# # Flares (> 2 cat) vs Genotyped PRS (unadj) = West Poly Male and Fixed Poly Male
# snps <- Poly_Gene_OR$RSID
# 
# out <- data.frame("SNP" = "NA", "OR" = NA_real_, "LCL" = NA_real_, "UCL" = NA_real_, "P" = NA_real_) %>% 
#   slice(-1)
# for(i in seq_along(snps)){
#   tmp <- FlareModels4 %>% 
#     filter(Sex == "Male",
#            str_detect(Cohort, "Polynesian"),
#            LCL != 0,
#            Predictor == snps[i])
#   
#   tophi <- metagen(TE = `log-odds`, 
#                    seTE = SE, 
#                    studlab = Cohort,
#                    data = tmp,
#                    sm = "OR")
#   
#   out <- rbind(out, list("SNP" = snps[i], "OR" = exp(tophi$TE.fixed), "LCL" = exp(tophi$lower.fixed), "UCL" = exp(tophi$upper.fixed), "P" = tophi$pval.fixed))
# }
# 
# out %>% 
#   filter(P < 0.05 / length(snps)) %>% 
#   nrow()